diff --git a/.gitattributes b/.gitattributes index 2c70b4a4c7a5edc6e2694ca124fc22ed2d8a3bde..42840a62ddfa1d2fced8ca2aba8d024304dbe874 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5029,3 +5029,12 @@ Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_ Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-0/checkpoint-752/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-0/checkpoint-903/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec1ef8d47b111af6b72ef86a5e420ba55ab736f6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18d3065462d28b00e94897958b444dda35b3dd73c28df887bc7c6cfe0ea65800 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e27047f51527938d40ac0db632858ab8c40f6335 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cc8a0bc46271dd7b4c992c6f7d21a4b68a961d125b50bb54dc061d1ecd4cef +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..81a8cd298ea89ea511d69aa8264de6dd53db9161 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1355012e59d01cca6936cab9109c2731c32dc7c95d4fd18cca06b60d907884d5 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b223759d202d46badee5d55abac6365814f4eb2d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26b99e05be79bd0f1b9ff6d769fd825f009768e072fbd1fcea8e5a6d49a34ede +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f69df0e45093107ef86f42d9b7bfb2e3333706a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8073b3be73946b85bc167aa6d6e91a7136066a38112f637602354d6a08b16a98 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4fd1b46ecfd47235c7688033b7e3d52d77ceab --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/trainer_state.json @@ -0,0 +1,7727 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 7.997066373303997, + "eval_steps": 10, + "global_step": 10904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.48628315329551697, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 2730 + }, + { + "epoch": 2.0095342867620096, + "grad_norm": 0.413875013589859, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 2740 + }, + { + "epoch": 2.0168683535020167, + "grad_norm": 0.4988735616207123, + "learning_rate": 0.0002, + "loss": 0.965, + "step": 2750 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.5634812712669373, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 2760 + }, + { + "epoch": 2.0315364869820316, + "grad_norm": 0.48302653431892395, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 2770 + }, + { + "epoch": 2.038870553722039, + "grad_norm": 0.49914175271987915, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 2780 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 1.14039945602417, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2790 + }, + { + "epoch": 2.0535386872020536, + "grad_norm": 0.6359720826148987, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 2800 + }, + { + "epoch": 2.060872753942061, + "grad_norm": 0.4589158296585083, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 2810 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.46255481243133545, + "learning_rate": 0.0002, + "loss": 0.9438, + "step": 2820 + }, + { + "epoch": 2.0755408874220755, + "grad_norm": 0.6232137680053711, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 2830 + }, + { + "epoch": 2.082874954162083, + "grad_norm": 0.41042178869247437, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 2840 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.5334428548812866, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2850 + }, + { + "epoch": 2.0975430876420975, + "grad_norm": 0.8270058631896973, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 2860 + }, + { + "epoch": 2.104877154382105, + "grad_norm": 0.6624533534049988, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 2870 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 2880 + }, + { + "epoch": 2.1195452878621195, + "grad_norm": 0.621482789516449, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 2890 + }, + { + "epoch": 2.126879354602127, + "grad_norm": 0.4556255340576172, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2900 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 0.4620579183101654, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 2910 + }, + { + "epoch": 2.1415474880821415, + "grad_norm": 0.9602415561676025, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2920 + }, + { + "epoch": 2.148881554822149, + "grad_norm": 0.587943971157074, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2930 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 0.5121372938156128, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 2940 + }, + { + "epoch": 2.1635496883021634, + "grad_norm": 0.49424484372138977, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 2950 + }, + { + "epoch": 2.170883755042171, + "grad_norm": 0.6312560439109802, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2960 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.5235576629638672, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2970 + }, + { + "epoch": 2.1855518885221854, + "grad_norm": 0.5868439674377441, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 2980 + }, + { + "epoch": 2.192885955262193, + "grad_norm": 0.42302873730659485, + "learning_rate": 0.0002, + "loss": 0.9338, + "step": 2990 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.5097725987434387, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 3000 + }, + { + "epoch": 2.2075540887422074, + "grad_norm": 0.5091572403907776, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 3010 + }, + { + "epoch": 2.214888155482215, + "grad_norm": 0.49433162808418274, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 3020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5577368140220642, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 3030 + }, + { + "epoch": 2.2295562889622293, + "grad_norm": 0.6177583932876587, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 3040 + }, + { + "epoch": 2.236890355702237, + "grad_norm": 0.5256719589233398, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 3050 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.5001118183135986, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3060 + }, + { + "epoch": 2.2515584891822513, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3070 + }, + { + "epoch": 2.258892555922259, + "grad_norm": 0.5325384140014648, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3080 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.5719189047813416, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 2.2735606894022737, + "grad_norm": 0.6337835788726807, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 3100 + }, + { + "epoch": 2.2808947561422808, + "grad_norm": 0.5381836891174316, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 3110 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.5408531427383423, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 3120 + }, + { + "epoch": 2.2955628896222957, + "grad_norm": 0.43705281615257263, + "learning_rate": 0.0002, + "loss": 1.0325, + "step": 3130 + }, + { + "epoch": 2.3028969563623027, + "grad_norm": 0.6454030275344849, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 3140 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.686030387878418, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 3150 + }, + { + "epoch": 2.3175650898423177, + "grad_norm": 0.5123633146286011, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 3160 + }, + { + "epoch": 2.3248991565823247, + "grad_norm": 0.842506468296051, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 3170 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.5193818807601929, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 2.3395672900623397, + "grad_norm": 0.5634409189224243, + "learning_rate": 0.0002, + "loss": 0.9473, + "step": 3190 + }, + { + "epoch": 2.3469013568023467, + "grad_norm": 0.6475534439086914, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 3200 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.1503914594650269, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 2.3615694902823616, + "grad_norm": 0.7234905362129211, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 3220 + }, + { + "epoch": 2.368903557022369, + "grad_norm": 0.664903461933136, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 3230 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.5453006625175476, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 3240 + }, + { + "epoch": 2.3835716905023836, + "grad_norm": 0.6256654262542725, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 3250 + }, + { + "epoch": 2.390905757242391, + "grad_norm": 0.5166565179824829, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.5699098110198975, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 3270 + }, + { + "epoch": 2.4055738907224056, + "grad_norm": 0.4472540020942688, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 3280 + }, + { + "epoch": 2.412907957462413, + "grad_norm": 0.6790403127670288, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3290 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 0.5182185173034668, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 3300 + }, + { + "epoch": 2.4275760909424275, + "grad_norm": 0.564647912979126, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 3310 + }, + { + "epoch": 2.434910157682435, + "grad_norm": 0.5625313520431519, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 3320 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.7496559619903564, + "learning_rate": 0.0002, + "loss": 0.8798, + "step": 3330 + }, + { + "epoch": 2.4495782911624495, + "grad_norm": 0.4779128134250641, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 2.456912357902457, + "grad_norm": 0.578093409538269, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3350 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.5456080436706543, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 3360 + }, + { + "epoch": 2.4715804913824715, + "grad_norm": 0.4769273102283478, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 3370 + }, + { + "epoch": 2.478914558122479, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0002, + "loss": 0.9312, + "step": 3380 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.5590165853500366, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3390 + }, + { + "epoch": 2.4935826916024935, + "grad_norm": 0.801306962966919, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 3400 + }, + { + "epoch": 2.500916758342501, + "grad_norm": 0.6045624613761902, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 3410 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.5735858082771301, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 3420 + }, + { + "epoch": 2.5155848918225154, + "grad_norm": 0.6827309131622314, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 3430 + }, + { + "epoch": 2.522918958562523, + "grad_norm": 0.5702602863311768, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 3440 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.6674721240997314, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 3450 + }, + { + "epoch": 2.5375870920425374, + "grad_norm": 0.5635907649993896, + "learning_rate": 0.0002, + "loss": 0.914, + "step": 3460 + }, + { + "epoch": 2.544921158782545, + "grad_norm": 0.42737770080566406, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 3470 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.6720691919326782, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 2.55958929226256, + "grad_norm": 0.8917084336280823, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 3490 + }, + { + "epoch": 2.566923359002567, + "grad_norm": 0.5134549140930176, + "learning_rate": 0.0002, + "loss": 0.9257, + "step": 3500 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.4951367974281311, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 3510 + }, + { + "epoch": 2.5815914924825814, + "grad_norm": 0.9438204765319824, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 3520 + }, + { + "epoch": 2.588925559222589, + "grad_norm": 0.6024714708328247, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 3530 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.5248535871505737, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 3540 + }, + { + "epoch": 2.6035936927026038, + "grad_norm": 0.8677568435668945, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 3550 + }, + { + "epoch": 2.610927759442611, + "grad_norm": 0.82008296251297, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 3560 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.4724634885787964, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 3570 + }, + { + "epoch": 2.6255958929226257, + "grad_norm": 0.5434244275093079, + "learning_rate": 0.0002, + "loss": 0.9058, + "step": 3580 + }, + { + "epoch": 2.6329299596626328, + "grad_norm": 0.4948740005493164, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 3590 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.42109328508377075, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3600 + }, + { + "epoch": 2.6475980931426477, + "grad_norm": 0.7979786396026611, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 3610 + }, + { + "epoch": 2.654932159882655, + "grad_norm": 0.6345919370651245, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 3620 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.4971671402454376, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 3630 + }, + { + "epoch": 2.6696002933626697, + "grad_norm": 0.6467748284339905, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 3640 + }, + { + "epoch": 2.6769343601026767, + "grad_norm": 0.4240160286426544, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 3650 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.5179754495620728, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3660 + }, + { + "epoch": 2.6916024935826917, + "grad_norm": 0.754012405872345, + "learning_rate": 0.0002, + "loss": 0.9221, + "step": 3670 + }, + { + "epoch": 2.698936560322699, + "grad_norm": 0.5141299962997437, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3680 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.5737819075584412, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 3690 + }, + { + "epoch": 2.7136046938027136, + "grad_norm": 0.5887577533721924, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 3700 + }, + { + "epoch": 2.720938760542721, + "grad_norm": 0.6740471720695496, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 3710 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 3720 + }, + { + "epoch": 2.7356068940227356, + "grad_norm": 0.4858354926109314, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 3730 + }, + { + "epoch": 2.742940960762743, + "grad_norm": 0.5489001870155334, + "learning_rate": 0.0002, + "loss": 0.9308, + "step": 3740 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.8187092542648315, + "learning_rate": 0.0002, + "loss": 0.894, + "step": 3750 + }, + { + "epoch": 2.7576090942427576, + "grad_norm": 0.5666626691818237, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 3760 + }, + { + "epoch": 2.764943160982765, + "grad_norm": 0.5377066135406494, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 3770 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.566330075263977, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 3780 + }, + { + "epoch": 2.7796112944627795, + "grad_norm": 0.5522832870483398, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 3790 + }, + { + "epoch": 2.786945361202787, + "grad_norm": 0.5668695569038391, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3800 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.7566602826118469, + "learning_rate": 0.0002, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 2.8016134946828015, + "grad_norm": 0.5603684782981873, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 3820 + }, + { + "epoch": 2.808947561422809, + "grad_norm": 0.49122217297554016, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.6798251867294312, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 3840 + }, + { + "epoch": 2.8236156949028235, + "grad_norm": 0.6097991466522217, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 3850 + }, + { + "epoch": 2.830949761642831, + "grad_norm": 0.6675726175308228, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 3860 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.9223952889442444, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 3870 + }, + { + "epoch": 2.8456178951228455, + "grad_norm": 0.6020799875259399, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 3880 + }, + { + "epoch": 2.852951961862853, + "grad_norm": 0.5206381678581238, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3890 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.6268777251243591, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 3900 + }, + { + "epoch": 2.8676200953428674, + "grad_norm": 1.1583497524261475, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 3910 + }, + { + "epoch": 2.874954162082875, + "grad_norm": 0.7263903021812439, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 3920 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.5369910001754761, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 3930 + }, + { + "epoch": 2.88962229556289, + "grad_norm": 0.7298350930213928, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 3940 + }, + { + "epoch": 2.896956362302897, + "grad_norm": 0.577012836933136, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.5859594345092773, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 3960 + }, + { + "epoch": 2.9116244957829114, + "grad_norm": 0.47176122665405273, + "learning_rate": 0.0002, + "loss": 0.8772, + "step": 3970 + }, + { + "epoch": 2.918958562522919, + "grad_norm": 0.9699620604515076, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 3980 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.7908747792243958, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3990 + }, + { + "epoch": 2.933626696002934, + "grad_norm": 0.5777379274368286, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 4000 + }, + { + "epoch": 2.940960762742941, + "grad_norm": 0.599288284778595, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 4010 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.5232274532318115, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 4020 + }, + { + "epoch": 2.9556288962229558, + "grad_norm": 0.6395137310028076, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 4030 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.589260458946228, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 4040 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5699581503868103, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 4050 + }, + { + "epoch": 2.9776310964429777, + "grad_norm": 0.528468132019043, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 4060 + }, + { + "epoch": 2.984965163182985, + "grad_norm": 0.4804670512676239, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4070 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 1.1918889284133911, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 4080 + }, + { + "epoch": 2.9996332966629997, + "grad_norm": 0.5479103326797485, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 4090 + }, + { + "epoch": 2.9996332966629997, + "eval_loss": 1.1642853021621704, + "eval_runtime": 32.7511, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.649, + "step": 4090 + }, + { + "epoch": 3.006967363403007, + "grad_norm": 0.7430027723312378, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 4100 + }, + { + "epoch": 3.014301430143014, + "grad_norm": 0.6293647289276123, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4110 + }, + { + "epoch": 3.0216354968830217, + "grad_norm": 0.6191329956054688, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 4120 + }, + { + "epoch": 3.028969563623029, + "grad_norm": 0.7959313988685608, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4130 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 0.5956351161003113, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 4140 + }, + { + "epoch": 3.0436376971030437, + "grad_norm": 0.670383632183075, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 4150 + }, + { + "epoch": 3.050971763843051, + "grad_norm": 0.6414518356323242, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 4160 + }, + { + "epoch": 3.058305830583058, + "grad_norm": 0.7928852438926697, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 4170 + }, + { + "epoch": 3.0656398973230656, + "grad_norm": 0.6211121082305908, + "learning_rate": 0.0002, + "loss": 0.7914, + "step": 4180 + }, + { + "epoch": 3.072973964063073, + "grad_norm": 0.6237057447433472, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 4190 + }, + { + "epoch": 3.08030803080308, + "grad_norm": 0.6522233486175537, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 4200 + }, + { + "epoch": 3.0876420975430876, + "grad_norm": 0.9396848678588867, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4210 + }, + { + "epoch": 3.094976164283095, + "grad_norm": 0.8003010749816895, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 4220 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.6733810305595398, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 4230 + }, + { + "epoch": 3.1096442977631096, + "grad_norm": 0.6365828514099121, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 4240 + }, + { + "epoch": 3.116978364503117, + "grad_norm": 1.0805548429489136, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4250 + }, + { + "epoch": 3.1243124312431245, + "grad_norm": 0.7262141108512878, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4260 + }, + { + "epoch": 3.1316464979831315, + "grad_norm": 0.5500539541244507, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 4270 + }, + { + "epoch": 3.138980564723139, + "grad_norm": 0.793912947177887, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 4280 + }, + { + "epoch": 3.1463146314631465, + "grad_norm": 1.2540518045425415, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 4290 + }, + { + "epoch": 3.1536486982031535, + "grad_norm": 0.7020077705383301, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 4300 + }, + { + "epoch": 3.160982764943161, + "grad_norm": 0.5111123323440552, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 4310 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.7172090411186218, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 4320 + }, + { + "epoch": 3.1756508984231755, + "grad_norm": 0.6343168616294861, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 4330 + }, + { + "epoch": 3.182984965163183, + "grad_norm": 0.9563672542572021, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4340 + }, + { + "epoch": 3.1903190319031904, + "grad_norm": 1.0225574970245361, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4350 + }, + { + "epoch": 3.1976530986431975, + "grad_norm": 1.1633386611938477, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 4360 + }, + { + "epoch": 3.204987165383205, + "grad_norm": 0.8915148973464966, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 4370 + }, + { + "epoch": 3.2123212321232124, + "grad_norm": 0.9156812429428101, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4380 + }, + { + "epoch": 3.21965529886322, + "grad_norm": 0.6363258957862854, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 4390 + }, + { + "epoch": 3.226989365603227, + "grad_norm": 0.579099178314209, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 4400 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 0.8778146505355835, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 4410 + }, + { + "epoch": 3.241657499083242, + "grad_norm": 0.8356770873069763, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 4420 + }, + { + "epoch": 3.248991565823249, + "grad_norm": 0.702032208442688, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 4430 + }, + { + "epoch": 3.2563256325632564, + "grad_norm": 0.6386539340019226, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 4440 + }, + { + "epoch": 3.263659699303264, + "grad_norm": 0.7008408904075623, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 4450 + }, + { + "epoch": 3.270993766043271, + "grad_norm": 0.9556332230567932, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 4460 + }, + { + "epoch": 3.2783278327832783, + "grad_norm": 0.5667835474014282, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 4470 + }, + { + "epoch": 3.285661899523286, + "grad_norm": 0.8239172697067261, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 4480 + }, + { + "epoch": 3.292995966263293, + "grad_norm": 0.7045050859451294, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 4490 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.7131434082984924, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 4500 + }, + { + "epoch": 3.3076640997433078, + "grad_norm": 0.6924910545349121, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 4510 + }, + { + "epoch": 3.3149981664833152, + "grad_norm": 0.8945356607437134, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 4520 + }, + { + "epoch": 3.3223322332233223, + "grad_norm": 0.6546903252601624, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 4530 + }, + { + "epoch": 3.3296662999633297, + "grad_norm": 0.8206679224967957, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4540 + }, + { + "epoch": 3.3370003667033368, + "grad_norm": 0.6482203602790833, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 4550 + }, + { + "epoch": 3.3443344334433442, + "grad_norm": 0.7558760046958923, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 4560 + }, + { + "epoch": 3.3516685001833517, + "grad_norm": 0.7794756889343262, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 4570 + }, + { + "epoch": 3.359002566923359, + "grad_norm": 0.7382805943489075, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4580 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.5912511944770813, + "learning_rate": 0.0002, + "loss": 0.8511, + "step": 4590 + }, + { + "epoch": 3.3736707004033737, + "grad_norm": 0.7444885969161987, + "learning_rate": 0.0002, + "loss": 0.8272, + "step": 4600 + }, + { + "epoch": 3.381004767143381, + "grad_norm": 0.7354922890663147, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 4610 + }, + { + "epoch": 3.388338833883388, + "grad_norm": 0.7685934901237488, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 4620 + }, + { + "epoch": 3.3956729006233957, + "grad_norm": 0.61041259765625, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 4630 + }, + { + "epoch": 3.403006967363403, + "grad_norm": 0.6820451021194458, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 4640 + }, + { + "epoch": 3.41034103410341, + "grad_norm": 0.5819534063339233, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 4650 + }, + { + "epoch": 3.4176751008434176, + "grad_norm": 0.705410897731781, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 4660 + }, + { + "epoch": 3.425009167583425, + "grad_norm": 0.8052892088890076, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 4670 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.7746483087539673, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 4680 + }, + { + "epoch": 3.4396773010634396, + "grad_norm": 0.7713689804077148, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 4690 + }, + { + "epoch": 3.447011367803447, + "grad_norm": 0.810371994972229, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 4700 + }, + { + "epoch": 3.4543454345434546, + "grad_norm": 0.7702969312667847, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4710 + }, + { + "epoch": 3.4616795012834616, + "grad_norm": 0.7069268822669983, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4720 + }, + { + "epoch": 3.469013568023469, + "grad_norm": 0.7640359401702881, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 4730 + }, + { + "epoch": 3.4763476347634765, + "grad_norm": 0.8661707639694214, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 4740 + }, + { + "epoch": 3.4836817015034836, + "grad_norm": 0.9970282912254333, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 4750 + }, + { + "epoch": 3.491015768243491, + "grad_norm": 0.5824355483055115, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 4760 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 1.3072649240493774, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 4770 + }, + { + "epoch": 3.5056839017235055, + "grad_norm": 0.873978316783905, + "learning_rate": 0.0002, + "loss": 0.9101, + "step": 4780 + }, + { + "epoch": 3.513017968463513, + "grad_norm": 0.5526657104492188, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4790 + }, + { + "epoch": 3.5203520352035205, + "grad_norm": 0.790894627571106, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 4800 + }, + { + "epoch": 3.5276861019435275, + "grad_norm": 0.8119630217552185, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 4810 + }, + { + "epoch": 3.535020168683535, + "grad_norm": 0.633212149143219, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 4820 + }, + { + "epoch": 3.5423542354235424, + "grad_norm": 0.703029990196228, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 4830 + }, + { + "epoch": 3.54968830216355, + "grad_norm": 0.7603771686553955, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 4840 + }, + { + "epoch": 3.557022368903557, + "grad_norm": 0.6260480880737305, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 4850 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.8203664422035217, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 4860 + }, + { + "epoch": 3.5716905023835714, + "grad_norm": 0.7793813347816467, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 4870 + }, + { + "epoch": 3.579024569123579, + "grad_norm": 0.7667397260665894, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 4880 + }, + { + "epoch": 3.5863586358635864, + "grad_norm": 0.8198829889297485, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 4890 + }, + { + "epoch": 3.593692702603594, + "grad_norm": 0.7689233422279358, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 4900 + }, + { + "epoch": 3.601026769343601, + "grad_norm": 0.7870983481407166, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 4910 + }, + { + "epoch": 3.6083608360836084, + "grad_norm": 0.8133853077888489, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 4920 + }, + { + "epoch": 3.615694902823616, + "grad_norm": 1.308401346206665, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4930 + }, + { + "epoch": 3.623028969563623, + "grad_norm": 0.7131121754646301, + "learning_rate": 0.0002, + "loss": 0.8494, + "step": 4940 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.6825910210609436, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 4950 + }, + { + "epoch": 3.637697103043638, + "grad_norm": 0.7254678606987, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4960 + }, + { + "epoch": 3.6450311697836453, + "grad_norm": 0.8045085072517395, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4970 + }, + { + "epoch": 3.6523652365236523, + "grad_norm": 0.6991777420043945, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 4980 + }, + { + "epoch": 3.6596993032636598, + "grad_norm": 0.7804713249206543, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 4990 + }, + { + "epoch": 3.667033370003667, + "grad_norm": 0.8525708317756653, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 5000 + }, + { + "epoch": 3.6743674367436743, + "grad_norm": 0.7959994673728943, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 5010 + }, + { + "epoch": 3.6817015034836817, + "grad_norm": 0.8103628158569336, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5020 + }, + { + "epoch": 3.689035570223689, + "grad_norm": 0.7517836093902588, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 5030 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.6878514289855957, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 5040 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 1.2371820211410522, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 5050 + }, + { + "epoch": 3.711037770443711, + "grad_norm": 0.6567103862762451, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 5060 + }, + { + "epoch": 3.718371837183718, + "grad_norm": 1.1254922151565552, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 5070 + }, + { + "epoch": 3.7257059039237257, + "grad_norm": 0.6796132326126099, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 5080 + }, + { + "epoch": 3.733039970663733, + "grad_norm": 0.7285300493240356, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5090 + }, + { + "epoch": 3.7403740374037406, + "grad_norm": 0.8931500911712646, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 5100 + }, + { + "epoch": 3.7477081041437477, + "grad_norm": 0.6256856918334961, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 5110 + }, + { + "epoch": 3.755042170883755, + "grad_norm": 0.79310142993927, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5120 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.6594041585922241, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 5130 + }, + { + "epoch": 3.7697103043637696, + "grad_norm": 0.7029327750205994, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 5140 + }, + { + "epoch": 3.777044371103777, + "grad_norm": 0.5880070328712463, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 5150 + }, + { + "epoch": 3.7843784378437846, + "grad_norm": 0.7578945159912109, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 5160 + }, + { + "epoch": 3.7917125045837916, + "grad_norm": 0.8276378512382507, + "learning_rate": 0.0002, + "loss": 0.8819, + "step": 5170 + }, + { + "epoch": 3.799046571323799, + "grad_norm": 0.7627953886985779, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 5180 + }, + { + "epoch": 3.806380638063806, + "grad_norm": 0.8169086575508118, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 5190 + }, + { + "epoch": 3.8137147048038136, + "grad_norm": 0.6605030298233032, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 5200 + }, + { + "epoch": 3.821048771543821, + "grad_norm": 0.5837286114692688, + "learning_rate": 0.0002, + "loss": 0.8804, + "step": 5210 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 1.2422157526016235, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 5220 + }, + { + "epoch": 3.8357169050238356, + "grad_norm": 0.6589220762252808, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 5230 + }, + { + "epoch": 3.843050971763843, + "grad_norm": 0.8567556142807007, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5240 + }, + { + "epoch": 3.8503850385038505, + "grad_norm": 0.6490627527236938, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 5250 + }, + { + "epoch": 3.8577191052438575, + "grad_norm": 0.620232880115509, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5260 + }, + { + "epoch": 3.865053171983865, + "grad_norm": 0.7685128450393677, + "learning_rate": 0.0002, + "loss": 0.9192, + "step": 5270 + }, + { + "epoch": 3.8723872387238725, + "grad_norm": 0.8113296627998352, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 5280 + }, + { + "epoch": 3.87972130546388, + "grad_norm": 0.8092675805091858, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 5290 + }, + { + "epoch": 3.887055372203887, + "grad_norm": 0.583570122718811, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 5300 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 1.712363600730896, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 5310 + }, + { + "epoch": 3.9017235056839015, + "grad_norm": 0.6673534512519836, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5320 + }, + { + "epoch": 3.909057572423909, + "grad_norm": 1.9770312309265137, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5330 + }, + { + "epoch": 3.9163916391639164, + "grad_norm": 0.6430999636650085, + "learning_rate": 0.0002, + "loss": 0.8793, + "step": 5340 + }, + { + "epoch": 3.923725705903924, + "grad_norm": 1.0159571170806885, + "learning_rate": 0.0002, + "loss": 0.839, + "step": 5350 + }, + { + "epoch": 3.931059772643931, + "grad_norm": 0.8607584834098816, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 5360 + }, + { + "epoch": 3.9383938393839384, + "grad_norm": 0.6967900991439819, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 5370 + }, + { + "epoch": 3.945727906123946, + "grad_norm": 0.7683077454566956, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 5380 + }, + { + "epoch": 3.953061972863953, + "grad_norm": 0.6805762648582458, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5390 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.7033619284629822, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5400 + }, + { + "epoch": 3.967730106343968, + "grad_norm": 0.966112494468689, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5410 + }, + { + "epoch": 3.9750641730839753, + "grad_norm": 0.8467881083488464, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 5420 + }, + { + "epoch": 3.9823982398239823, + "grad_norm": 0.8005317449569702, + "learning_rate": 0.0002, + "loss": 0.8084, + "step": 5430 + }, + { + "epoch": 3.98973230656399, + "grad_norm": 1.1615241765975952, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 5440 + }, + { + "epoch": 3.997066373303997, + "grad_norm": 0.6121614575386047, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 5450 + }, + { + "epoch": 4.0, + "eval_loss": 1.1834222078323364, + "eval_runtime": 32.7569, + "eval_samples_per_second": 13.158, + "eval_steps_per_second": 1.649, + "step": 5454 + }, + { + "epoch": 4.004400440044004, + "grad_norm": 0.6055727005004883, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 5460 + }, + { + "epoch": 4.011734506784012, + "grad_norm": 0.8232647180557251, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 5470 + }, + { + "epoch": 4.019068573524019, + "grad_norm": 0.7739192247390747, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5480 + }, + { + "epoch": 4.026402640264027, + "grad_norm": 0.6264950633049011, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 5490 + }, + { + "epoch": 4.033736707004033, + "grad_norm": 1.4798702001571655, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 5500 + }, + { + "epoch": 4.041070773744041, + "grad_norm": 0.9538470506668091, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 5510 + }, + { + "epoch": 4.048404840484048, + "grad_norm": 0.834561288356781, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 5520 + }, + { + "epoch": 4.055738907224056, + "grad_norm": 0.6407850384712219, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 5530 + }, + { + "epoch": 4.063072973964063, + "grad_norm": 0.9035961627960205, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 5540 + }, + { + "epoch": 4.070407040704071, + "grad_norm": 0.842812716960907, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 5550 + }, + { + "epoch": 4.077741107444078, + "grad_norm": 0.8197882175445557, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 5560 + }, + { + "epoch": 4.085075174184085, + "grad_norm": 0.8652673959732056, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 5570 + }, + { + "epoch": 4.092409240924092, + "grad_norm": 0.8048318028450012, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 5580 + }, + { + "epoch": 4.0997433076641, + "grad_norm": 0.9604969024658203, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 5590 + }, + { + "epoch": 4.107077374404107, + "grad_norm": 1.244756817817688, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 5600 + }, + { + "epoch": 4.114411441144115, + "grad_norm": 0.7975269556045532, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5610 + }, + { + "epoch": 4.121745507884122, + "grad_norm": 0.6130099296569824, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 5620 + }, + { + "epoch": 4.129079574624129, + "grad_norm": 0.7793202996253967, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 5630 + }, + { + "epoch": 4.136413641364136, + "grad_norm": 1.187238335609436, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 5640 + }, + { + "epoch": 4.143747708104144, + "grad_norm": 0.8450375199317932, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 5650 + }, + { + "epoch": 4.151081774844151, + "grad_norm": 0.9006940126419067, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5660 + }, + { + "epoch": 4.158415841584159, + "grad_norm": 0.9447154998779297, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 5670 + }, + { + "epoch": 4.165749908324166, + "grad_norm": 0.798032283782959, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 5680 + }, + { + "epoch": 4.1730839750641735, + "grad_norm": 0.65578693151474, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 5690 + }, + { + "epoch": 4.18041804180418, + "grad_norm": 1.0864700078964233, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 5700 + }, + { + "epoch": 4.187752108544188, + "grad_norm": 0.7344121932983398, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 5710 + }, + { + "epoch": 4.195086175284195, + "grad_norm": 0.9722456932067871, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 5720 + }, + { + "epoch": 4.2024202420242025, + "grad_norm": 1.263814926147461, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 5730 + }, + { + "epoch": 4.20975430876421, + "grad_norm": 0.9622581005096436, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 5740 + }, + { + "epoch": 4.2170883755042174, + "grad_norm": 0.8497143387794495, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 5750 + }, + { + "epoch": 4.224422442244224, + "grad_norm": 0.8248446583747864, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 5760 + }, + { + "epoch": 4.2317565089842315, + "grad_norm": 1.2544798851013184, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 5770 + }, + { + "epoch": 4.239090575724239, + "grad_norm": 0.8224676251411438, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5780 + }, + { + "epoch": 4.2464246424642464, + "grad_norm": 0.8924877047538757, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 5790 + }, + { + "epoch": 4.253758709204254, + "grad_norm": 0.8545848727226257, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 5800 + }, + { + "epoch": 4.261092775944261, + "grad_norm": 0.8081067800521851, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 5810 + }, + { + "epoch": 4.268426842684269, + "grad_norm": 0.7111002802848816, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 5820 + }, + { + "epoch": 4.2757609094242754, + "grad_norm": 0.8696979880332947, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 5830 + }, + { + "epoch": 4.283094976164283, + "grad_norm": 0.821401834487915, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 5840 + }, + { + "epoch": 4.29042904290429, + "grad_norm": 0.888908326625824, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 5850 + }, + { + "epoch": 4.297763109644298, + "grad_norm": 1.9380123615264893, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 5860 + }, + { + "epoch": 4.305097176384305, + "grad_norm": 1.121774435043335, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 5870 + }, + { + "epoch": 4.312431243124313, + "grad_norm": 0.9238282442092896, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 5880 + }, + { + "epoch": 4.319765309864319, + "grad_norm": 0.7321620583534241, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 5890 + }, + { + "epoch": 4.327099376604327, + "grad_norm": 0.8739548325538635, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 5900 + }, + { + "epoch": 4.334433443344334, + "grad_norm": 0.9686012268066406, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 5910 + }, + { + "epoch": 4.341767510084342, + "grad_norm": 0.9033839106559753, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5920 + }, + { + "epoch": 4.349101576824349, + "grad_norm": 0.8131115436553955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 5930 + }, + { + "epoch": 4.356435643564357, + "grad_norm": 0.8942412734031677, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 5940 + }, + { + "epoch": 4.363769710304364, + "grad_norm": 0.8439112901687622, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 5950 + }, + { + "epoch": 4.371103777044371, + "grad_norm": 0.9176713228225708, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 5960 + }, + { + "epoch": 4.378437843784378, + "grad_norm": 0.6799634695053101, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 5970 + }, + { + "epoch": 4.385771910524386, + "grad_norm": 1.0435824394226074, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5980 + }, + { + "epoch": 4.393105977264393, + "grad_norm": 0.997937798500061, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 5990 + }, + { + "epoch": 4.400440044004401, + "grad_norm": 1.0308842658996582, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 6000 + }, + { + "epoch": 4.407774110744408, + "grad_norm": 1.3683775663375854, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 6010 + }, + { + "epoch": 4.415108177484415, + "grad_norm": 0.7569534182548523, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 6020 + }, + { + "epoch": 4.422442244224422, + "grad_norm": 1.089978575706482, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 6030 + }, + { + "epoch": 4.42977631096443, + "grad_norm": 0.7522459626197815, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 6040 + }, + { + "epoch": 4.437110377704437, + "grad_norm": 0.6709823608398438, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 6050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.6992089748382568, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 6060 + }, + { + "epoch": 4.451778511184452, + "grad_norm": 1.0182931423187256, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 6070 + }, + { + "epoch": 4.459112577924459, + "grad_norm": 1.0685160160064697, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 6080 + }, + { + "epoch": 4.466446644664466, + "grad_norm": 0.8295124769210815, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 6090 + }, + { + "epoch": 4.473780711404474, + "grad_norm": 1.1862998008728027, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6100 + }, + { + "epoch": 4.481114778144481, + "grad_norm": 0.7400273084640503, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 6110 + }, + { + "epoch": 4.488448844884489, + "grad_norm": 0.7098417282104492, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 6120 + }, + { + "epoch": 4.495782911624496, + "grad_norm": 0.9745053648948669, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 6130 + }, + { + "epoch": 4.503116978364503, + "grad_norm": 0.8638797998428345, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 6140 + }, + { + "epoch": 4.51045104510451, + "grad_norm": 0.8291046619415283, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6150 + }, + { + "epoch": 4.517785111844518, + "grad_norm": 1.0301737785339355, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 6160 + }, + { + "epoch": 4.525119178584525, + "grad_norm": 1.1996512413024902, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 6170 + }, + { + "epoch": 4.5324532453245325, + "grad_norm": 1.151038408279419, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 6180 + }, + { + "epoch": 4.53978731206454, + "grad_norm": 0.8385201096534729, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 6190 + }, + { + "epoch": 4.5471213788045475, + "grad_norm": 0.8969188332557678, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 6200 + }, + { + "epoch": 4.554455445544555, + "grad_norm": 1.60659658908844, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 6210 + }, + { + "epoch": 4.5617895122845615, + "grad_norm": 0.9356731176376343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 6220 + }, + { + "epoch": 4.569123579024569, + "grad_norm": 0.95856773853302, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 6230 + }, + { + "epoch": 4.5764576457645765, + "grad_norm": 1.1162524223327637, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 6240 + }, + { + "epoch": 4.583791712504584, + "grad_norm": 0.8809238076210022, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 6250 + }, + { + "epoch": 4.591125779244591, + "grad_norm": 0.890738844871521, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 6260 + }, + { + "epoch": 4.598459845984598, + "grad_norm": 0.918684720993042, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 6270 + }, + { + "epoch": 4.6057939127246055, + "grad_norm": 0.8156296610832214, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 6280 + }, + { + "epoch": 4.613127979464613, + "grad_norm": 1.046634316444397, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6290 + }, + { + "epoch": 4.62046204620462, + "grad_norm": 0.7725525498390198, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 6300 + }, + { + "epoch": 4.627796112944628, + "grad_norm": 0.9992046356201172, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 6310 + }, + { + "epoch": 4.635130179684635, + "grad_norm": 0.8480095267295837, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 6320 + }, + { + "epoch": 4.642464246424643, + "grad_norm": 0.7061955332756042, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 6330 + }, + { + "epoch": 4.649798313164649, + "grad_norm": 1.0354212522506714, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 6340 + }, + { + "epoch": 4.657132379904657, + "grad_norm": 1.0081377029418945, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6350 + }, + { + "epoch": 4.664466446644664, + "grad_norm": 1.2904249429702759, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 6360 + }, + { + "epoch": 4.671800513384672, + "grad_norm": 0.9248910546302795, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 6370 + }, + { + "epoch": 4.679134580124679, + "grad_norm": 0.9907804131507874, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 6380 + }, + { + "epoch": 4.686468646864687, + "grad_norm": 1.201143741607666, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 6390 + }, + { + "epoch": 4.693802713604693, + "grad_norm": 0.8709394335746765, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 6400 + }, + { + "epoch": 4.701136780344701, + "grad_norm": 0.7468608021736145, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 6410 + }, + { + "epoch": 4.708470847084708, + "grad_norm": 0.8607903718948364, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 6420 + }, + { + "epoch": 4.715804913824716, + "grad_norm": 0.9840512871742249, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 6430 + }, + { + "epoch": 4.723138980564723, + "grad_norm": 0.8328204154968262, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 6440 + }, + { + "epoch": 4.730473047304731, + "grad_norm": 0.924505352973938, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 6450 + }, + { + "epoch": 4.737807114044738, + "grad_norm": 0.8897685408592224, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 6460 + }, + { + "epoch": 4.745141180784745, + "grad_norm": 0.9605024456977844, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6470 + }, + { + "epoch": 4.752475247524752, + "grad_norm": 0.8150759935379028, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 6480 + }, + { + "epoch": 4.75980931426476, + "grad_norm": 0.8128412961959839, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 6490 + }, + { + "epoch": 4.767143381004767, + "grad_norm": 0.7381404638290405, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 6500 + }, + { + "epoch": 4.774477447744775, + "grad_norm": 1.0565853118896484, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 6510 + }, + { + "epoch": 4.781811514484782, + "grad_norm": 0.9298134446144104, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6520 + }, + { + "epoch": 4.789145581224789, + "grad_norm": 1.0145525932312012, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 6530 + }, + { + "epoch": 4.796479647964796, + "grad_norm": 0.92259681224823, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 6540 + }, + { + "epoch": 4.803813714704804, + "grad_norm": 0.7881024479866028, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 6550 + }, + { + "epoch": 4.811147781444811, + "grad_norm": 1.4935206174850464, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 6560 + }, + { + "epoch": 4.818481848184819, + "grad_norm": 0.8612369298934937, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 6570 + }, + { + "epoch": 4.825815914924826, + "grad_norm": 1.0118653774261475, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 6580 + }, + { + "epoch": 4.833149981664834, + "grad_norm": 1.1303809881210327, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 6590 + }, + { + "epoch": 4.84048404840484, + "grad_norm": 0.9112492203712463, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 6600 + }, + { + "epoch": 4.847818115144848, + "grad_norm": 0.864762544631958, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 6610 + }, + { + "epoch": 4.855152181884855, + "grad_norm": 0.9090572595596313, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 6620 + }, + { + "epoch": 4.862486248624863, + "grad_norm": 1.014953374862671, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 6630 + }, + { + "epoch": 4.86982031536487, + "grad_norm": 1.0702149868011475, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 6640 + }, + { + "epoch": 4.8771543821048775, + "grad_norm": 1.002135157585144, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 6650 + }, + { + "epoch": 4.884488448844884, + "grad_norm": 0.862545907497406, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 6660 + }, + { + "epoch": 4.891822515584892, + "grad_norm": 0.7302131056785583, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 6670 + }, + { + "epoch": 4.899156582324899, + "grad_norm": 0.8380730152130127, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 6680 + }, + { + "epoch": 4.9064906490649065, + "grad_norm": 0.7956018447875977, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 6690 + }, + { + "epoch": 4.913824715804914, + "grad_norm": 0.6717583537101746, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 6700 + }, + { + "epoch": 4.9211587825449215, + "grad_norm": 1.09099280834198, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 6710 + }, + { + "epoch": 4.928492849284929, + "grad_norm": 0.8589889407157898, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 6720 + }, + { + "epoch": 4.9358269160249355, + "grad_norm": 1.0046314001083374, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 6730 + }, + { + "epoch": 4.943160982764943, + "grad_norm": 0.8559659123420715, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 6740 + }, + { + "epoch": 4.9504950495049505, + "grad_norm": 0.8588525652885437, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 6750 + }, + { + "epoch": 4.957829116244958, + "grad_norm": 0.9192708134651184, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 6760 + }, + { + "epoch": 4.965163182984965, + "grad_norm": 1.051398754119873, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 6770 + }, + { + "epoch": 4.972497249724973, + "grad_norm": 0.9111362099647522, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 6780 + }, + { + "epoch": 4.9798313164649795, + "grad_norm": 0.7305638194084167, + "learning_rate": 0.0002, + "loss": 0.7613, + "step": 6790 + }, + { + "epoch": 4.987165383204987, + "grad_norm": 1.118837594985962, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 6800 + }, + { + "epoch": 4.994499449944994, + "grad_norm": 0.9075239300727844, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 6810 + }, + { + "epoch": 4.999633296662999, + "eval_loss": 1.2361247539520264, + "eval_runtime": 32.7325, + "eval_samples_per_second": 13.167, + "eval_steps_per_second": 1.65, + "step": 6817 + }, + { + "epoch": 5.001833516685002, + "grad_norm": 1.0541315078735352, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 6820 + }, + { + "epoch": 5.009167583425009, + "grad_norm": 0.9750140905380249, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 6830 + }, + { + "epoch": 5.016501650165017, + "grad_norm": 0.931838870048523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 6840 + }, + { + "epoch": 5.023835716905023, + "grad_norm": 1.110278844833374, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 6850 + }, + { + "epoch": 5.031169783645031, + "grad_norm": 1.0670180320739746, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 6860 + }, + { + "epoch": 5.038503850385038, + "grad_norm": 0.8762092590332031, + "learning_rate": 0.0002, + "loss": 0.4374, + "step": 6870 + }, + { + "epoch": 5.045837917125046, + "grad_norm": 1.1169432401657104, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 6880 + }, + { + "epoch": 5.053171983865053, + "grad_norm": 1.005491018295288, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 6890 + }, + { + "epoch": 5.060506050605061, + "grad_norm": 1.1751841306686401, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 6900 + }, + { + "epoch": 5.067840117345068, + "grad_norm": 0.8501367568969727, + "learning_rate": 0.0002, + "loss": 0.451, + "step": 6910 + }, + { + "epoch": 5.075174184085075, + "grad_norm": 0.9795131683349609, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 6920 + }, + { + "epoch": 5.082508250825082, + "grad_norm": 0.8929879665374756, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 6930 + }, + { + "epoch": 5.08984231756509, + "grad_norm": 1.0156651735305786, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 6940 + }, + { + "epoch": 5.097176384305097, + "grad_norm": 1.0974335670471191, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 6950 + }, + { + "epoch": 5.104510451045105, + "grad_norm": 1.7015666961669922, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 6960 + }, + { + "epoch": 5.111844517785112, + "grad_norm": 1.0343226194381714, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 6970 + }, + { + "epoch": 5.119178584525119, + "grad_norm": 1.3072983026504517, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 6980 + }, + { + "epoch": 5.126512651265126, + "grad_norm": 1.038986086845398, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6990 + }, + { + "epoch": 5.133846718005134, + "grad_norm": 0.8638386130332947, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 7000 + }, + { + "epoch": 5.141180784745141, + "grad_norm": 0.8326523900032043, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 7010 + }, + { + "epoch": 5.148514851485149, + "grad_norm": 1.0976895093917847, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 7020 + }, + { + "epoch": 5.155848918225156, + "grad_norm": 1.0077873468399048, + "learning_rate": 0.0002, + "loss": 0.4677, + "step": 7030 + }, + { + "epoch": 5.163182984965164, + "grad_norm": 1.0662257671356201, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 7040 + }, + { + "epoch": 5.17051705170517, + "grad_norm": 1.206271767616272, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 7050 + }, + { + "epoch": 5.177851118445178, + "grad_norm": 1.1990262269973755, + "learning_rate": 0.0002, + "loss": 0.4817, + "step": 7060 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 1.0207163095474243, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 7070 + }, + { + "epoch": 5.192519251925193, + "grad_norm": 1.2783987522125244, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 7080 + }, + { + "epoch": 5.1998533186652, + "grad_norm": 1.1592512130737305, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 7090 + }, + { + "epoch": 5.2071873854052075, + "grad_norm": 1.1053160429000854, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 7100 + }, + { + "epoch": 5.214521452145214, + "grad_norm": 1.1925510168075562, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 7110 + }, + { + "epoch": 5.221855518885222, + "grad_norm": 1.0714877843856812, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 7120 + }, + { + "epoch": 5.229189585625229, + "grad_norm": 0.9451011419296265, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 7130 + }, + { + "epoch": 5.2365236523652365, + "grad_norm": 1.03838050365448, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 7140 + }, + { + "epoch": 5.243857719105244, + "grad_norm": 0.9204146265983582, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 7150 + }, + { + "epoch": 5.2511917858452515, + "grad_norm": 1.0142229795455933, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 7160 + }, + { + "epoch": 5.258525852585258, + "grad_norm": 1.4432005882263184, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 7170 + }, + { + "epoch": 5.2658599193252655, + "grad_norm": 1.1239633560180664, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 7180 + }, + { + "epoch": 5.273193986065273, + "grad_norm": 0.7012821435928345, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 7190 + }, + { + "epoch": 5.2805280528052805, + "grad_norm": 1.3499128818511963, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 7200 + }, + { + "epoch": 5.287862119545288, + "grad_norm": 0.9498730897903442, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 7210 + }, + { + "epoch": 5.295196186285295, + "grad_norm": 0.9552369117736816, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 7220 + }, + { + "epoch": 5.302530253025303, + "grad_norm": 0.7610348463058472, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 7230 + }, + { + "epoch": 5.3098643197653095, + "grad_norm": 1.0314512252807617, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 7240 + }, + { + "epoch": 5.317198386505317, + "grad_norm": 1.0534334182739258, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 7250 + }, + { + "epoch": 5.324532453245324, + "grad_norm": 1.2553406953811646, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 7260 + }, + { + "epoch": 5.331866519985332, + "grad_norm": 0.7061691880226135, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 7270 + }, + { + "epoch": 5.339200586725339, + "grad_norm": 0.9652578830718994, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 7280 + }, + { + "epoch": 5.346534653465347, + "grad_norm": 1.114788293838501, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 7290 + }, + { + "epoch": 5.353868720205353, + "grad_norm": 1.0940049886703491, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 7300 + }, + { + "epoch": 5.361202786945361, + "grad_norm": 1.0151008367538452, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 7310 + }, + { + "epoch": 5.368536853685368, + "grad_norm": 1.0369552373886108, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 7320 + }, + { + "epoch": 5.375870920425376, + "grad_norm": 0.8489866256713867, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 7330 + }, + { + "epoch": 5.383204987165383, + "grad_norm": 1.1031713485717773, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 7340 + }, + { + "epoch": 5.390539053905391, + "grad_norm": 0.9094716310501099, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 7350 + }, + { + "epoch": 5.397873120645398, + "grad_norm": 0.9530431032180786, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 7360 + }, + { + "epoch": 5.405207187385405, + "grad_norm": 0.9633604884147644, + "learning_rate": 0.0002, + "loss": 0.529, + "step": 7370 + }, + { + "epoch": 5.412541254125412, + "grad_norm": 0.9541662335395813, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 7380 + }, + { + "epoch": 5.41987532086542, + "grad_norm": 1.0459771156311035, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7390 + }, + { + "epoch": 5.427209387605427, + "grad_norm": 1.027388334274292, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 7400 + }, + { + "epoch": 5.434543454345435, + "grad_norm": 0.7267653346061707, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 7410 + }, + { + "epoch": 5.441877521085442, + "grad_norm": 1.020142674446106, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 7420 + }, + { + "epoch": 5.449211587825449, + "grad_norm": 1.044754147529602, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 7430 + }, + { + "epoch": 5.456545654565456, + "grad_norm": 1.5476195812225342, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 7440 + }, + { + "epoch": 5.463879721305464, + "grad_norm": 0.9879506826400757, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 7450 + }, + { + "epoch": 5.471213788045471, + "grad_norm": 1.2562980651855469, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 7460 + }, + { + "epoch": 5.478547854785479, + "grad_norm": 1.3051384687423706, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 7470 + }, + { + "epoch": 5.485881921525486, + "grad_norm": 1.0511597394943237, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 7480 + }, + { + "epoch": 5.493215988265494, + "grad_norm": 1.0380817651748657, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 7490 + }, + { + "epoch": 5.5005500550055, + "grad_norm": 1.170274257659912, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 7500 + }, + { + "epoch": 5.507884121745508, + "grad_norm": 1.3356517553329468, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 7510 + }, + { + "epoch": 5.515218188485515, + "grad_norm": 1.0727124214172363, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 7520 + }, + { + "epoch": 5.522552255225523, + "grad_norm": 1.0110199451446533, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 7530 + }, + { + "epoch": 5.52988632196553, + "grad_norm": 1.3086743354797363, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 7540 + }, + { + "epoch": 5.537220388705538, + "grad_norm": 1.1904916763305664, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 7550 + }, + { + "epoch": 5.544554455445544, + "grad_norm": 0.9466280937194824, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 7560 + }, + { + "epoch": 5.551888522185552, + "grad_norm": 1.1237901449203491, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 7570 + }, + { + "epoch": 5.559222588925559, + "grad_norm": 0.9590660333633423, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 7580 + }, + { + "epoch": 5.566556655665567, + "grad_norm": 1.0890778303146362, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 7590 + }, + { + "epoch": 5.573890722405574, + "grad_norm": 0.7206931114196777, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 7600 + }, + { + "epoch": 5.5812247891455815, + "grad_norm": 1.2884514331817627, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 7610 + }, + { + "epoch": 5.588558855885589, + "grad_norm": 0.7798039317131042, + "learning_rate": 0.0002, + "loss": 0.5279, + "step": 7620 + }, + { + "epoch": 5.595892922625596, + "grad_norm": 1.166046142578125, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 7630 + }, + { + "epoch": 5.603226989365603, + "grad_norm": 1.0150201320648193, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 7640 + }, + { + "epoch": 5.6105610561056105, + "grad_norm": 1.0449682474136353, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 7650 + }, + { + "epoch": 5.617895122845618, + "grad_norm": 0.9310530424118042, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 7660 + }, + { + "epoch": 5.6252291895856255, + "grad_norm": 0.9117933511734009, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 7670 + }, + { + "epoch": 5.632563256325633, + "grad_norm": 1.1475164890289307, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 7680 + }, + { + "epoch": 5.6398973230656395, + "grad_norm": 1.066809058189392, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 7690 + }, + { + "epoch": 5.647231389805647, + "grad_norm": 1.2834991216659546, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 7700 + }, + { + "epoch": 5.6545654565456545, + "grad_norm": 1.2245112657546997, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 7710 + }, + { + "epoch": 5.661899523285662, + "grad_norm": 1.1424106359481812, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 7720 + }, + { + "epoch": 5.669233590025669, + "grad_norm": 1.0673892498016357, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 7730 + }, + { + "epoch": 5.676567656765677, + "grad_norm": 1.4312121868133545, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 7740 + }, + { + "epoch": 5.683901723505684, + "grad_norm": 0.9976982474327087, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 7750 + }, + { + "epoch": 5.691235790245691, + "grad_norm": 0.9464678168296814, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 7760 + }, + { + "epoch": 5.698569856985698, + "grad_norm": 1.010995626449585, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 7770 + }, + { + "epoch": 5.705903923725706, + "grad_norm": 1.3787750005722046, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 7780 + }, + { + "epoch": 5.713237990465713, + "grad_norm": 1.020922303199768, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 7790 + }, + { + "epoch": 5.720572057205721, + "grad_norm": 0.9748636484146118, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 7800 + }, + { + "epoch": 5.727906123945728, + "grad_norm": 1.3077744245529175, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 7810 + }, + { + "epoch": 5.735240190685735, + "grad_norm": 1.4770057201385498, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 7820 + }, + { + "epoch": 5.742574257425742, + "grad_norm": 1.6349090337753296, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 7830 + }, + { + "epoch": 5.74990832416575, + "grad_norm": 0.9818630814552307, + "learning_rate": 0.0002, + "loss": 0.5056, + "step": 7840 + }, + { + "epoch": 5.757242390905757, + "grad_norm": 0.9659715890884399, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 7850 + }, + { + "epoch": 5.764576457645765, + "grad_norm": 0.9269950985908508, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 7860 + }, + { + "epoch": 5.771910524385772, + "grad_norm": 1.0099073648452759, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 7870 + }, + { + "epoch": 5.77924459112578, + "grad_norm": 0.9123615026473999, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 7880 + }, + { + "epoch": 5.786578657865786, + "grad_norm": 1.1542246341705322, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 7890 + }, + { + "epoch": 5.793912724605794, + "grad_norm": 1.0792022943496704, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 7900 + }, + { + "epoch": 5.801246791345801, + "grad_norm": 0.95615553855896, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 7910 + }, + { + "epoch": 5.808580858085809, + "grad_norm": 1.2471332550048828, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 7920 + }, + { + "epoch": 5.815914924825816, + "grad_norm": 1.0189851522445679, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 7930 + }, + { + "epoch": 5.823248991565823, + "grad_norm": 1.3309742212295532, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 7940 + }, + { + "epoch": 5.83058305830583, + "grad_norm": 1.2930549383163452, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 7950 + }, + { + "epoch": 5.837917125045838, + "grad_norm": 0.8216308951377869, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 7960 + }, + { + "epoch": 5.845251191785845, + "grad_norm": 1.1205775737762451, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 7970 + }, + { + "epoch": 5.852585258525853, + "grad_norm": 0.851298451423645, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 7980 + }, + { + "epoch": 5.85991932526586, + "grad_norm": 0.8797095417976379, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 7990 + }, + { + "epoch": 5.867253392005868, + "grad_norm": 1.5784614086151123, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 8000 + }, + { + "epoch": 5.874587458745875, + "grad_norm": 1.1531187295913696, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 8010 + }, + { + "epoch": 5.881921525485882, + "grad_norm": 1.2469146251678467, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 8020 + }, + { + "epoch": 5.889255592225889, + "grad_norm": 1.0784350633621216, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 8030 + }, + { + "epoch": 5.896589658965897, + "grad_norm": 1.1311599016189575, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 8040 + }, + { + "epoch": 5.903923725705904, + "grad_norm": 0.9654512405395508, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 8050 + }, + { + "epoch": 5.9112577924459115, + "grad_norm": 1.3288270235061646, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 8060 + }, + { + "epoch": 5.918591859185918, + "grad_norm": 1.12800931930542, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 8070 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 0.9449917674064636, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 8080 + }, + { + "epoch": 5.933259992665933, + "grad_norm": 1.1532357931137085, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 8090 + }, + { + "epoch": 5.9405940594059405, + "grad_norm": 1.2211151123046875, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 8100 + }, + { + "epoch": 5.947928126145948, + "grad_norm": 1.3459105491638184, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 8110 + }, + { + "epoch": 5.9552621928859555, + "grad_norm": 1.251999855041504, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 8120 + }, + { + "epoch": 5.962596259625963, + "grad_norm": 1.5682506561279297, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 8130 + }, + { + "epoch": 5.9699303263659695, + "grad_norm": 0.926075279712677, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 8140 + }, + { + "epoch": 5.977264393105977, + "grad_norm": 0.9622511863708496, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 8150 + }, + { + "epoch": 5.9845984598459845, + "grad_norm": 0.9633373618125916, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 8160 + }, + { + "epoch": 5.991932526585992, + "grad_norm": 0.8960476517677307, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 8170 + }, + { + "epoch": 5.999266593325999, + "grad_norm": 0.9372805953025818, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 8180 + }, + { + "epoch": 6.0, + "eval_loss": 1.3233846426010132, + "eval_runtime": 32.7419, + "eval_samples_per_second": 13.164, + "eval_steps_per_second": 1.649, + "step": 8181 + }, + { + "epoch": 6.006600660066007, + "grad_norm": 1.1900787353515625, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 8190 + }, + { + "epoch": 6.013934726806014, + "grad_norm": 1.1448326110839844, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 8200 + }, + { + "epoch": 6.021268793546021, + "grad_norm": 1.1848368644714355, + "learning_rate": 0.0002, + "loss": 0.3667, + "step": 8210 + }, + { + "epoch": 6.028602860286028, + "grad_norm": 1.2315572500228882, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 8220 + }, + { + "epoch": 6.035936927026036, + "grad_norm": 1.2214244604110718, + "learning_rate": 0.0002, + "loss": 0.3541, + "step": 8230 + }, + { + "epoch": 6.043270993766043, + "grad_norm": 0.9455513954162598, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 8240 + }, + { + "epoch": 6.050605060506051, + "grad_norm": 0.9574248790740967, + "learning_rate": 0.0002, + "loss": 0.4448, + "step": 8250 + }, + { + "epoch": 6.057939127246058, + "grad_norm": 1.1022400856018066, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 8260 + }, + { + "epoch": 6.065273193986065, + "grad_norm": 0.9555122256278992, + "learning_rate": 0.0002, + "loss": 0.3603, + "step": 8270 + }, + { + "epoch": 6.072607260726072, + "grad_norm": 1.1956106424331665, + "learning_rate": 0.0002, + "loss": 0.4324, + "step": 8280 + }, + { + "epoch": 6.07994132746608, + "grad_norm": 1.3110876083374023, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 8290 + }, + { + "epoch": 6.087275394206087, + "grad_norm": 1.1293374300003052, + "learning_rate": 0.0002, + "loss": 0.3664, + "step": 8300 + }, + { + "epoch": 6.094609460946095, + "grad_norm": 0.9176164269447327, + "learning_rate": 0.0002, + "loss": 0.385, + "step": 8310 + }, + { + "epoch": 6.101943527686102, + "grad_norm": 0.9751231670379639, + "learning_rate": 0.0002, + "loss": 0.4142, + "step": 8320 + }, + { + "epoch": 6.109277594426109, + "grad_norm": 1.0536044836044312, + "learning_rate": 0.0002, + "loss": 0.4356, + "step": 8330 + }, + { + "epoch": 6.116611661166116, + "grad_norm": 1.289342999458313, + "learning_rate": 0.0002, + "loss": 0.409, + "step": 8340 + }, + { + "epoch": 6.123945727906124, + "grad_norm": 1.1773661375045776, + "learning_rate": 0.0002, + "loss": 0.4121, + "step": 8350 + }, + { + "epoch": 6.131279794646131, + "grad_norm": 1.2450661659240723, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 8360 + }, + { + "epoch": 6.138613861386139, + "grad_norm": 1.3965914249420166, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 8370 + }, + { + "epoch": 6.145947928126146, + "grad_norm": 1.3530808687210083, + "learning_rate": 0.0002, + "loss": 0.4024, + "step": 8380 + }, + { + "epoch": 6.153281994866154, + "grad_norm": 1.296276330947876, + "learning_rate": 0.0002, + "loss": 0.4658, + "step": 8390 + }, + { + "epoch": 6.16061606160616, + "grad_norm": 0.9759053587913513, + "learning_rate": 0.0002, + "loss": 0.5073, + "step": 8400 + }, + { + "epoch": 6.167950128346168, + "grad_norm": 1.2110707759857178, + "learning_rate": 0.0002, + "loss": 0.4718, + "step": 8410 + }, + { + "epoch": 6.175284195086175, + "grad_norm": 1.312226414680481, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 8420 + }, + { + "epoch": 6.182618261826183, + "grad_norm": 1.1696736812591553, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 8430 + }, + { + "epoch": 6.18995232856619, + "grad_norm": 1.260304570198059, + "learning_rate": 0.0002, + "loss": 0.4546, + "step": 8440 + }, + { + "epoch": 6.197286395306198, + "grad_norm": 1.472961187362671, + "learning_rate": 0.0002, + "loss": 0.4137, + "step": 8450 + }, + { + "epoch": 6.204620462046204, + "grad_norm": 1.3618475198745728, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 8460 + }, + { + "epoch": 6.211954528786212, + "grad_norm": 1.2544318437576294, + "learning_rate": 0.0002, + "loss": 0.415, + "step": 8470 + }, + { + "epoch": 6.219288595526219, + "grad_norm": 1.205898642539978, + "learning_rate": 0.0002, + "loss": 0.3907, + "step": 8480 + }, + { + "epoch": 6.226622662266227, + "grad_norm": 0.9984724521636963, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 8490 + }, + { + "epoch": 6.233956729006234, + "grad_norm": 1.3184109926223755, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 8500 + }, + { + "epoch": 6.241290795746242, + "grad_norm": 1.135520100593567, + "learning_rate": 0.0002, + "loss": 0.3859, + "step": 8510 + }, + { + "epoch": 6.248624862486249, + "grad_norm": 1.4528400897979736, + "learning_rate": 0.0002, + "loss": 0.4159, + "step": 8520 + }, + { + "epoch": 6.255958929226256, + "grad_norm": 1.1222716569900513, + "learning_rate": 0.0002, + "loss": 0.4347, + "step": 8530 + }, + { + "epoch": 6.263292995966263, + "grad_norm": 1.7878046035766602, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 8540 + }, + { + "epoch": 6.270627062706271, + "grad_norm": 0.9789481163024902, + "learning_rate": 0.0002, + "loss": 0.4298, + "step": 8550 + }, + { + "epoch": 6.277961129446278, + "grad_norm": 1.151977300643921, + "learning_rate": 0.0002, + "loss": 0.4316, + "step": 8560 + }, + { + "epoch": 6.2852951961862855, + "grad_norm": 1.389968752861023, + "learning_rate": 0.0002, + "loss": 0.428, + "step": 8570 + }, + { + "epoch": 6.292629262926293, + "grad_norm": 0.884211003780365, + "learning_rate": 0.0002, + "loss": 0.3903, + "step": 8580 + }, + { + "epoch": 6.2999633296663, + "grad_norm": 1.3604296445846558, + "learning_rate": 0.0002, + "loss": 0.4611, + "step": 8590 + }, + { + "epoch": 6.307297396406307, + "grad_norm": 1.1845694780349731, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 8600 + }, + { + "epoch": 6.3146314631463145, + "grad_norm": 1.3231550455093384, + "learning_rate": 0.0002, + "loss": 0.472, + "step": 8610 + }, + { + "epoch": 6.321965529886322, + "grad_norm": 0.9546721577644348, + "learning_rate": 0.0002, + "loss": 0.3922, + "step": 8620 + }, + { + "epoch": 6.3292995966263295, + "grad_norm": 1.2329787015914917, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 8630 + }, + { + "epoch": 6.336633663366337, + "grad_norm": 1.0240199565887451, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 8640 + }, + { + "epoch": 6.343967730106344, + "grad_norm": 1.1866962909698486, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 8650 + }, + { + "epoch": 6.351301796846351, + "grad_norm": 1.2819687128067017, + "learning_rate": 0.0002, + "loss": 0.4575, + "step": 8660 + }, + { + "epoch": 6.3586358635863585, + "grad_norm": 0.9654944539070129, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 8670 + }, + { + "epoch": 6.365969930326366, + "grad_norm": 0.9443874955177307, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 8680 + }, + { + "epoch": 6.373303997066373, + "grad_norm": 1.2914115190505981, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 8690 + }, + { + "epoch": 6.380638063806381, + "grad_norm": 1.4558709859848022, + "learning_rate": 0.0002, + "loss": 0.4392, + "step": 8700 + }, + { + "epoch": 6.387972130546388, + "grad_norm": 1.3255952596664429, + "learning_rate": 0.0002, + "loss": 0.4398, + "step": 8710 + }, + { + "epoch": 6.395306197286395, + "grad_norm": 1.348742961883545, + "learning_rate": 0.0002, + "loss": 0.4451, + "step": 8720 + }, + { + "epoch": 6.402640264026402, + "grad_norm": 1.0096025466918945, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 8730 + }, + { + "epoch": 6.40997433076641, + "grad_norm": 1.1720590591430664, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 8740 + }, + { + "epoch": 6.417308397506417, + "grad_norm": 1.1803077459335327, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 8750 + }, + { + "epoch": 6.424642464246425, + "grad_norm": 1.3649998903274536, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 8760 + }, + { + "epoch": 6.431976530986432, + "grad_norm": 1.1503992080688477, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 8770 + }, + { + "epoch": 6.43931059772644, + "grad_norm": 1.1537176370620728, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 8780 + }, + { + "epoch": 6.446644664466446, + "grad_norm": 0.9743003845214844, + "learning_rate": 0.0002, + "loss": 0.4167, + "step": 8790 + }, + { + "epoch": 6.453978731206454, + "grad_norm": 0.9097744822502136, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 8800 + }, + { + "epoch": 6.461312797946461, + "grad_norm": 2.0174002647399902, + "learning_rate": 0.0002, + "loss": 0.4809, + "step": 8810 + }, + { + "epoch": 6.468646864686469, + "grad_norm": 1.0809309482574463, + "learning_rate": 0.0002, + "loss": 0.4879, + "step": 8820 + }, + { + "epoch": 6.475980931426476, + "grad_norm": 1.100294828414917, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 8830 + }, + { + "epoch": 6.483314998166484, + "grad_norm": 1.3707489967346191, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 8840 + }, + { + "epoch": 6.49064906490649, + "grad_norm": 1.1304761171340942, + "learning_rate": 0.0002, + "loss": 0.4533, + "step": 8850 + }, + { + "epoch": 6.497983131646498, + "grad_norm": 1.2171573638916016, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 8860 + }, + { + "epoch": 6.505317198386505, + "grad_norm": 1.0452901124954224, + "learning_rate": 0.0002, + "loss": 0.4694, + "step": 8870 + }, + { + "epoch": 6.512651265126513, + "grad_norm": 1.197298526763916, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 8880 + }, + { + "epoch": 6.51998533186652, + "grad_norm": 0.9179880619049072, + "learning_rate": 0.0002, + "loss": 0.4167, + "step": 8890 + }, + { + "epoch": 6.527319398606528, + "grad_norm": 1.415079951286316, + "learning_rate": 0.0002, + "loss": 0.445, + "step": 8900 + }, + { + "epoch": 6.534653465346535, + "grad_norm": 1.1032487154006958, + "learning_rate": 0.0002, + "loss": 0.424, + "step": 8910 + }, + { + "epoch": 6.541987532086542, + "grad_norm": 1.2295007705688477, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 8920 + }, + { + "epoch": 6.549321598826549, + "grad_norm": 1.4223219156265259, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 8930 + }, + { + "epoch": 6.556655665566557, + "grad_norm": 1.2785786390304565, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 8940 + }, + { + "epoch": 6.563989732306564, + "grad_norm": 1.3514775037765503, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 8950 + }, + { + "epoch": 6.571323799046572, + "grad_norm": 1.107937216758728, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 8960 + }, + { + "epoch": 6.578657865786578, + "grad_norm": 1.2839902639389038, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 8970 + }, + { + "epoch": 6.585991932526586, + "grad_norm": 0.9793244004249573, + "learning_rate": 0.0002, + "loss": 0.4207, + "step": 8980 + }, + { + "epoch": 6.593325999266593, + "grad_norm": 1.3403126001358032, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 8990 + }, + { + "epoch": 6.600660066006601, + "grad_norm": 1.2612813711166382, + "learning_rate": 0.0002, + "loss": 0.465, + "step": 9000 + }, + { + "epoch": 6.607994132746608, + "grad_norm": 1.4347625970840454, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 9010 + }, + { + "epoch": 6.6153281994866155, + "grad_norm": 1.225921869277954, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 9020 + }, + { + "epoch": 6.622662266226623, + "grad_norm": 1.033644676208496, + "learning_rate": 0.0002, + "loss": 0.4364, + "step": 9030 + }, + { + "epoch": 6.6299963329666305, + "grad_norm": 1.1791894435882568, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 9040 + }, + { + "epoch": 6.637330399706637, + "grad_norm": 1.0968137979507446, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 9050 + }, + { + "epoch": 6.6446644664466445, + "grad_norm": 1.5639140605926514, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 9060 + }, + { + "epoch": 6.651998533186652, + "grad_norm": 1.4158905744552612, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 9070 + }, + { + "epoch": 6.6593325999266595, + "grad_norm": 1.2120254039764404, + "learning_rate": 0.0002, + "loss": 0.4619, + "step": 9080 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.1866531372070312, + "learning_rate": 0.0002, + "loss": 0.4564, + "step": 9090 + }, + { + "epoch": 6.6740007334066735, + "grad_norm": 1.2704026699066162, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 9100 + }, + { + "epoch": 6.681334800146681, + "grad_norm": 1.1878353357315063, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 9110 + }, + { + "epoch": 6.6886688668866885, + "grad_norm": 1.193995475769043, + "learning_rate": 0.0002, + "loss": 0.4657, + "step": 9120 + }, + { + "epoch": 6.696002933626696, + "grad_norm": 1.2927545309066772, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 9130 + }, + { + "epoch": 6.703337000366703, + "grad_norm": 1.0770703554153442, + "learning_rate": 0.0002, + "loss": 0.4157, + "step": 9140 + }, + { + "epoch": 6.710671067106711, + "grad_norm": 1.2200851440429688, + "learning_rate": 0.0002, + "loss": 0.4571, + "step": 9150 + }, + { + "epoch": 6.718005133846718, + "grad_norm": 1.293891191482544, + "learning_rate": 0.0002, + "loss": 0.4605, + "step": 9160 + }, + { + "epoch": 6.725339200586725, + "grad_norm": 1.9376052618026733, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 9170 + }, + { + "epoch": 6.732673267326732, + "grad_norm": 1.0353254079818726, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 9180 + }, + { + "epoch": 6.74000733406674, + "grad_norm": 1.1274057626724243, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 9190 + }, + { + "epoch": 6.747341400806747, + "grad_norm": 1.3344064950942993, + "learning_rate": 0.0002, + "loss": 0.4486, + "step": 9200 + }, + { + "epoch": 6.754675467546755, + "grad_norm": 1.303621768951416, + "learning_rate": 0.0002, + "loss": 0.49, + "step": 9210 + }, + { + "epoch": 6.762009534286762, + "grad_norm": 1.2327780723571777, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 9220 + }, + { + "epoch": 6.769343601026769, + "grad_norm": 1.3513109683990479, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 9230 + }, + { + "epoch": 6.776677667766776, + "grad_norm": 1.4762850999832153, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 9240 + }, + { + "epoch": 6.784011734506784, + "grad_norm": 1.0967189073562622, + "learning_rate": 0.0002, + "loss": 0.4181, + "step": 9250 + }, + { + "epoch": 6.791345801246791, + "grad_norm": 0.933936357498169, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 9260 + }, + { + "epoch": 6.798679867986799, + "grad_norm": 1.065553903579712, + "learning_rate": 0.0002, + "loss": 0.4667, + "step": 9270 + }, + { + "epoch": 6.806013934726806, + "grad_norm": 1.2044163942337036, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 9280 + }, + { + "epoch": 6.813348001466814, + "grad_norm": 1.404137134552002, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 9290 + }, + { + "epoch": 6.82068206820682, + "grad_norm": 1.4005582332611084, + "learning_rate": 0.0002, + "loss": 0.4442, + "step": 9300 + }, + { + "epoch": 6.828016134946828, + "grad_norm": 1.1771104335784912, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 9310 + }, + { + "epoch": 6.835350201686835, + "grad_norm": 1.191933035850525, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 9320 + }, + { + "epoch": 6.842684268426843, + "grad_norm": 1.3395432233810425, + "learning_rate": 0.0002, + "loss": 0.4733, + "step": 9330 + }, + { + "epoch": 6.85001833516685, + "grad_norm": 1.4145503044128418, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 9340 + }, + { + "epoch": 6.857352401906858, + "grad_norm": 1.1128839254379272, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 9350 + }, + { + "epoch": 6.864686468646864, + "grad_norm": 1.0771174430847168, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 9360 + }, + { + "epoch": 6.872020535386872, + "grad_norm": 1.1089814901351929, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 9370 + }, + { + "epoch": 6.879354602126879, + "grad_norm": 1.078444004058838, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 9380 + }, + { + "epoch": 6.886688668866887, + "grad_norm": 1.3676636219024658, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 9390 + }, + { + "epoch": 6.894022735606894, + "grad_norm": 0.8973749876022339, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 9400 + }, + { + "epoch": 6.901356802346902, + "grad_norm": 1.141552448272705, + "learning_rate": 0.0002, + "loss": 0.4274, + "step": 9410 + }, + { + "epoch": 6.908690869086909, + "grad_norm": 0.8345359563827515, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 9420 + }, + { + "epoch": 6.916024935826916, + "grad_norm": 1.1602197885513306, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 9430 + }, + { + "epoch": 6.923359002566923, + "grad_norm": 1.275466799736023, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 9440 + }, + { + "epoch": 6.930693069306931, + "grad_norm": 0.9186071157455444, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 9450 + }, + { + "epoch": 6.938027136046938, + "grad_norm": 0.9069198966026306, + "learning_rate": 0.0002, + "loss": 0.4604, + "step": 9460 + }, + { + "epoch": 6.945361202786946, + "grad_norm": 1.2331899404525757, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 9470 + }, + { + "epoch": 6.952695269526953, + "grad_norm": 0.8685150742530823, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 9480 + }, + { + "epoch": 6.96002933626696, + "grad_norm": 1.4067939519882202, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 9490 + }, + { + "epoch": 6.967363403006967, + "grad_norm": 1.1864029169082642, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 9500 + }, + { + "epoch": 6.974697469746975, + "grad_norm": 1.3697725534439087, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 9510 + }, + { + "epoch": 6.982031536486982, + "grad_norm": 1.1632893085479736, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 9520 + }, + { + "epoch": 6.9893656032269895, + "grad_norm": 1.1447268724441528, + "learning_rate": 0.0002, + "loss": 0.4526, + "step": 9530 + }, + { + "epoch": 6.996699669966997, + "grad_norm": 1.5017213821411133, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 9540 + }, + { + "epoch": 6.999633296662999, + "eval_loss": 1.4178194999694824, + "eval_runtime": 32.7488, + "eval_samples_per_second": 13.161, + "eval_steps_per_second": 1.649, + "step": 9544 + }, + { + "epoch": 7.0040337367070045, + "grad_norm": 1.110981822013855, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 9550 + }, + { + "epoch": 7.011367803447011, + "grad_norm": 1.2793253660202026, + "learning_rate": 0.0002, + "loss": 0.3475, + "step": 9560 + }, + { + "epoch": 7.0187018701870185, + "grad_norm": 1.1258823871612549, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 9570 + }, + { + "epoch": 7.026035936927026, + "grad_norm": 1.410486102104187, + "learning_rate": 0.0002, + "loss": 0.3008, + "step": 9580 + }, + { + "epoch": 7.0333700036670335, + "grad_norm": 1.2088500261306763, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 9590 + }, + { + "epoch": 7.040704070407041, + "grad_norm": 0.8303650617599487, + "learning_rate": 0.0002, + "loss": 0.3379, + "step": 9600 + }, + { + "epoch": 7.048038137147048, + "grad_norm": 0.9813525080680847, + "learning_rate": 0.0002, + "loss": 0.3376, + "step": 9610 + }, + { + "epoch": 7.055372203887055, + "grad_norm": 0.9679017066955566, + "learning_rate": 0.0002, + "loss": 0.3202, + "step": 9620 + }, + { + "epoch": 7.0627062706270625, + "grad_norm": 1.1532220840454102, + "learning_rate": 0.0002, + "loss": 0.3287, + "step": 9630 + }, + { + "epoch": 7.07004033736707, + "grad_norm": 1.312053918838501, + "learning_rate": 0.0002, + "loss": 0.3639, + "step": 9640 + }, + { + "epoch": 7.077374404107077, + "grad_norm": 1.0594364404678345, + "learning_rate": 0.0002, + "loss": 0.3278, + "step": 9650 + }, + { + "epoch": 7.084708470847085, + "grad_norm": 1.545080542564392, + "learning_rate": 0.0002, + "loss": 0.3259, + "step": 9660 + }, + { + "epoch": 7.092042537587092, + "grad_norm": 1.1748381853103638, + "learning_rate": 0.0002, + "loss": 0.328, + "step": 9670 + }, + { + "epoch": 7.0993766043271, + "grad_norm": 1.6107453107833862, + "learning_rate": 0.0002, + "loss": 0.3313, + "step": 9680 + }, + { + "epoch": 7.106710671067106, + "grad_norm": 0.9478244185447693, + "learning_rate": 0.0002, + "loss": 0.3469, + "step": 9690 + }, + { + "epoch": 7.114044737807114, + "grad_norm": 1.508410930633545, + "learning_rate": 0.0002, + "loss": 0.3289, + "step": 9700 + }, + { + "epoch": 7.121378804547121, + "grad_norm": 1.3175169229507446, + "learning_rate": 0.0002, + "loss": 0.3077, + "step": 9710 + }, + { + "epoch": 7.128712871287129, + "grad_norm": 1.2631924152374268, + "learning_rate": 0.0002, + "loss": 0.3241, + "step": 9720 + }, + { + "epoch": 7.136046938027136, + "grad_norm": 1.0640755891799927, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 9730 + }, + { + "epoch": 7.143381004767144, + "grad_norm": 1.247279405593872, + "learning_rate": 0.0002, + "loss": 0.3418, + "step": 9740 + }, + { + "epoch": 7.15071507150715, + "grad_norm": 1.2538974285125732, + "learning_rate": 0.0002, + "loss": 0.3385, + "step": 9750 + }, + { + "epoch": 7.158049138247158, + "grad_norm": 1.3157252073287964, + "learning_rate": 0.0002, + "loss": 0.3445, + "step": 9760 + }, + { + "epoch": 7.165383204987165, + "grad_norm": 1.5254220962524414, + "learning_rate": 0.0002, + "loss": 0.3518, + "step": 9770 + }, + { + "epoch": 7.172717271727173, + "grad_norm": 1.0063719749450684, + "learning_rate": 0.0002, + "loss": 0.3575, + "step": 9780 + }, + { + "epoch": 7.18005133846718, + "grad_norm": 0.8030351996421814, + "learning_rate": 0.0002, + "loss": 0.3701, + "step": 9790 + }, + { + "epoch": 7.187385405207188, + "grad_norm": 1.2086257934570312, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 9800 + }, + { + "epoch": 7.194719471947194, + "grad_norm": 1.7020413875579834, + "learning_rate": 0.0002, + "loss": 0.354, + "step": 9810 + }, + { + "epoch": 7.202053538687202, + "grad_norm": 1.2517976760864258, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 9820 + }, + { + "epoch": 7.209387605427209, + "grad_norm": 1.330505132675171, + "learning_rate": 0.0002, + "loss": 0.3503, + "step": 9830 + }, + { + "epoch": 7.216721672167217, + "grad_norm": 1.1273366212844849, + "learning_rate": 0.0002, + "loss": 0.3349, + "step": 9840 + }, + { + "epoch": 7.224055738907224, + "grad_norm": 1.3738148212432861, + "learning_rate": 0.0002, + "loss": 0.3562, + "step": 9850 + }, + { + "epoch": 7.231389805647232, + "grad_norm": 1.2162928581237793, + "learning_rate": 0.0002, + "loss": 0.3622, + "step": 9860 + }, + { + "epoch": 7.238723872387239, + "grad_norm": 1.743969440460205, + "learning_rate": 0.0002, + "loss": 0.3913, + "step": 9870 + }, + { + "epoch": 7.246057939127246, + "grad_norm": 1.5357484817504883, + "learning_rate": 0.0002, + "loss": 0.3855, + "step": 9880 + }, + { + "epoch": 7.253392005867253, + "grad_norm": 1.342976450920105, + "learning_rate": 0.0002, + "loss": 0.3556, + "step": 9890 + }, + { + "epoch": 7.260726072607261, + "grad_norm": 1.428523302078247, + "learning_rate": 0.0002, + "loss": 0.3787, + "step": 9900 + }, + { + "epoch": 7.268060139347268, + "grad_norm": 1.5631695985794067, + "learning_rate": 0.0002, + "loss": 0.343, + "step": 9910 + }, + { + "epoch": 7.275394206087276, + "grad_norm": 1.192564606666565, + "learning_rate": 0.0002, + "loss": 0.3292, + "step": 9920 + }, + { + "epoch": 7.282728272827283, + "grad_norm": 1.1428006887435913, + "learning_rate": 0.0002, + "loss": 0.324, + "step": 9930 + }, + { + "epoch": 7.29006233956729, + "grad_norm": 1.1959515810012817, + "learning_rate": 0.0002, + "loss": 0.2868, + "step": 9940 + }, + { + "epoch": 7.297396406307297, + "grad_norm": 0.984326958656311, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 9950 + }, + { + "epoch": 7.304730473047305, + "grad_norm": 1.1258848905563354, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 9960 + }, + { + "epoch": 7.312064539787312, + "grad_norm": 1.2445521354675293, + "learning_rate": 0.0002, + "loss": 0.3682, + "step": 9970 + }, + { + "epoch": 7.3193986065273196, + "grad_norm": 1.066351294517517, + "learning_rate": 0.0002, + "loss": 0.4032, + "step": 9980 + }, + { + "epoch": 7.326732673267327, + "grad_norm": 1.3111763000488281, + "learning_rate": 0.0002, + "loss": 0.3689, + "step": 9990 + }, + { + "epoch": 7.334066740007334, + "grad_norm": 1.5113508701324463, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 10000 + }, + { + "epoch": 7.341400806747341, + "grad_norm": 1.2499724626541138, + "learning_rate": 0.0002, + "loss": 0.3668, + "step": 10010 + }, + { + "epoch": 7.3487348734873486, + "grad_norm": 1.1003477573394775, + "learning_rate": 0.0002, + "loss": 0.392, + "step": 10020 + }, + { + "epoch": 7.356068940227356, + "grad_norm": 1.4911425113677979, + "learning_rate": 0.0002, + "loss": 0.4045, + "step": 10030 + }, + { + "epoch": 7.3634030069673635, + "grad_norm": 1.291712999343872, + "learning_rate": 0.0002, + "loss": 0.3892, + "step": 10040 + }, + { + "epoch": 7.370737073707371, + "grad_norm": 1.4001942873001099, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 10050 + }, + { + "epoch": 7.378071140447378, + "grad_norm": 2.015535593032837, + "learning_rate": 0.0002, + "loss": 0.404, + "step": 10060 + }, + { + "epoch": 7.385405207187385, + "grad_norm": 1.3355735540390015, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 10070 + }, + { + "epoch": 7.3927392739273925, + "grad_norm": 1.1258678436279297, + "learning_rate": 0.0002, + "loss": 0.3764, + "step": 10080 + }, + { + "epoch": 7.4000733406674, + "grad_norm": 1.3883707523345947, + "learning_rate": 0.0002, + "loss": 0.3827, + "step": 10090 + }, + { + "epoch": 7.407407407407407, + "grad_norm": 1.144474744796753, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 10100 + }, + { + "epoch": 7.414741474147415, + "grad_norm": 1.636843204498291, + "learning_rate": 0.0002, + "loss": 0.4024, + "step": 10110 + }, + { + "epoch": 7.422075540887422, + "grad_norm": 1.6167247295379639, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 10120 + }, + { + "epoch": 7.429409607627429, + "grad_norm": 1.3800078630447388, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 10130 + }, + { + "epoch": 7.436743674367436, + "grad_norm": 1.2631969451904297, + "learning_rate": 0.0002, + "loss": 0.3859, + "step": 10140 + }, + { + "epoch": 7.444077741107444, + "grad_norm": 1.32834792137146, + "learning_rate": 0.0002, + "loss": 0.3523, + "step": 10150 + }, + { + "epoch": 7.451411807847451, + "grad_norm": 1.370316982269287, + "learning_rate": 0.0002, + "loss": 0.3945, + "step": 10160 + }, + { + "epoch": 7.458745874587459, + "grad_norm": 1.6096234321594238, + "learning_rate": 0.0002, + "loss": 0.3695, + "step": 10170 + }, + { + "epoch": 7.466079941327466, + "grad_norm": 1.3638662099838257, + "learning_rate": 0.0002, + "loss": 0.3378, + "step": 10180 + }, + { + "epoch": 7.473414008067474, + "grad_norm": 1.3508107662200928, + "learning_rate": 0.0002, + "loss": 0.4015, + "step": 10190 + }, + { + "epoch": 7.48074807480748, + "grad_norm": 1.5599194765090942, + "learning_rate": 0.0002, + "loss": 0.4169, + "step": 10200 + }, + { + "epoch": 7.488082141547488, + "grad_norm": 1.4922538995742798, + "learning_rate": 0.0002, + "loss": 0.4071, + "step": 10210 + }, + { + "epoch": 7.495416208287495, + "grad_norm": 1.485437273979187, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 10220 + }, + { + "epoch": 7.502750275027503, + "grad_norm": 0.9040785431861877, + "learning_rate": 0.0002, + "loss": 0.382, + "step": 10230 + }, + { + "epoch": 7.51008434176751, + "grad_norm": 1.2453011274337769, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 10240 + }, + { + "epoch": 7.517418408507518, + "grad_norm": 1.4167460203170776, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 10250 + }, + { + "epoch": 7.524752475247524, + "grad_norm": 1.396972894668579, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 10260 + }, + { + "epoch": 7.532086541987532, + "grad_norm": 1.384286880493164, + "learning_rate": 0.0002, + "loss": 0.3658, + "step": 10270 + }, + { + "epoch": 7.539420608727539, + "grad_norm": 1.478095531463623, + "learning_rate": 0.0002, + "loss": 0.3813, + "step": 10280 + }, + { + "epoch": 7.546754675467547, + "grad_norm": 1.2642205953598022, + "learning_rate": 0.0002, + "loss": 0.3813, + "step": 10290 + }, + { + "epoch": 7.554088742207554, + "grad_norm": 1.1110541820526123, + "learning_rate": 0.0002, + "loss": 0.4084, + "step": 10300 + }, + { + "epoch": 7.561422808947562, + "grad_norm": 1.1147890090942383, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 10310 + }, + { + "epoch": 7.568756875687569, + "grad_norm": 1.5677998065948486, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 10320 + }, + { + "epoch": 7.576090942427576, + "grad_norm": 0.9347636699676514, + "learning_rate": 0.0002, + "loss": 0.3675, + "step": 10330 + }, + { + "epoch": 7.583425009167583, + "grad_norm": 1.1126737594604492, + "learning_rate": 0.0002, + "loss": 0.3534, + "step": 10340 + }, + { + "epoch": 7.590759075907591, + "grad_norm": 1.462611198425293, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 10350 + }, + { + "epoch": 7.598093142647598, + "grad_norm": 0.9907522201538086, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 10360 + }, + { + "epoch": 7.605427209387606, + "grad_norm": 1.306152582168579, + "learning_rate": 0.0002, + "loss": 0.4133, + "step": 10370 + }, + { + "epoch": 7.612761276127613, + "grad_norm": 1.11135995388031, + "learning_rate": 0.0002, + "loss": 0.3644, + "step": 10380 + }, + { + "epoch": 7.62009534286762, + "grad_norm": 1.0825806856155396, + "learning_rate": 0.0002, + "loss": 0.3659, + "step": 10390 + }, + { + "epoch": 7.627429409607627, + "grad_norm": 1.5346975326538086, + "learning_rate": 0.0002, + "loss": 0.3952, + "step": 10400 + }, + { + "epoch": 7.634763476347635, + "grad_norm": 1.5885388851165771, + "learning_rate": 0.0002, + "loss": 0.3807, + "step": 10410 + }, + { + "epoch": 7.642097543087642, + "grad_norm": 1.130261778831482, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 10420 + }, + { + "epoch": 7.64943160982765, + "grad_norm": 1.2318342924118042, + "learning_rate": 0.0002, + "loss": 0.3598, + "step": 10430 + }, + { + "epoch": 7.656765676567657, + "grad_norm": 1.07103431224823, + "learning_rate": 0.0002, + "loss": 0.3823, + "step": 10440 + }, + { + "epoch": 7.6640997433076645, + "grad_norm": 1.3836923837661743, + "learning_rate": 0.0002, + "loss": 0.4707, + "step": 10450 + }, + { + "epoch": 7.671433810047671, + "grad_norm": 1.2110271453857422, + "learning_rate": 0.0002, + "loss": 0.3829, + "step": 10460 + }, + { + "epoch": 7.678767876787679, + "grad_norm": 1.2304844856262207, + "learning_rate": 0.0002, + "loss": 0.3747, + "step": 10470 + }, + { + "epoch": 7.686101943527686, + "grad_norm": 1.3444706201553345, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 10480 + }, + { + "epoch": 7.6934360102676935, + "grad_norm": 1.151705026626587, + "learning_rate": 0.0002, + "loss": 0.3772, + "step": 10490 + }, + { + "epoch": 7.700770077007701, + "grad_norm": 1.4373983144760132, + "learning_rate": 0.0002, + "loss": 0.3351, + "step": 10500 + }, + { + "epoch": 7.7081041437477085, + "grad_norm": 1.5898514986038208, + "learning_rate": 0.0002, + "loss": 0.389, + "step": 10510 + }, + { + "epoch": 7.715438210487715, + "grad_norm": 1.3767389059066772, + "learning_rate": 0.0002, + "loss": 0.3679, + "step": 10520 + }, + { + "epoch": 7.7227722772277225, + "grad_norm": 1.3730027675628662, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 10530 + }, + { + "epoch": 7.73010634396773, + "grad_norm": 1.3917304277420044, + "learning_rate": 0.0002, + "loss": 0.4291, + "step": 10540 + }, + { + "epoch": 7.7374404107077375, + "grad_norm": 1.230663776397705, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 10550 + }, + { + "epoch": 7.744774477447745, + "grad_norm": 0.9922441244125366, + "learning_rate": 0.0002, + "loss": 0.4289, + "step": 10560 + }, + { + "epoch": 7.752108544187752, + "grad_norm": 1.4576551914215088, + "learning_rate": 0.0002, + "loss": 0.4118, + "step": 10570 + }, + { + "epoch": 7.75944261092776, + "grad_norm": 1.061685562133789, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 10580 + }, + { + "epoch": 7.7667766776677665, + "grad_norm": 1.1523276567459106, + "learning_rate": 0.0002, + "loss": 0.3748, + "step": 10590 + }, + { + "epoch": 7.774110744407774, + "grad_norm": 1.3917267322540283, + "learning_rate": 0.0002, + "loss": 0.3982, + "step": 10600 + }, + { + "epoch": 7.781444811147781, + "grad_norm": 1.1314283609390259, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 10610 + }, + { + "epoch": 7.788778877887789, + "grad_norm": 1.624324083328247, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 10620 + }, + { + "epoch": 7.796112944627796, + "grad_norm": 1.5369168519973755, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 10630 + }, + { + "epoch": 7.803447011367804, + "grad_norm": 1.082222819328308, + "learning_rate": 0.0002, + "loss": 0.3661, + "step": 10640 + }, + { + "epoch": 7.81078107810781, + "grad_norm": 1.281540870666504, + "learning_rate": 0.0002, + "loss": 0.4341, + "step": 10650 + }, + { + "epoch": 7.818115144847818, + "grad_norm": 1.1889171600341797, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 10660 + }, + { + "epoch": 7.825449211587825, + "grad_norm": 1.5310896635055542, + "learning_rate": 0.0002, + "loss": 0.3923, + "step": 10670 + }, + { + "epoch": 7.832783278327833, + "grad_norm": 1.6724708080291748, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 10680 + }, + { + "epoch": 7.84011734506784, + "grad_norm": 1.3742409944534302, + "learning_rate": 0.0002, + "loss": 0.363, + "step": 10690 + }, + { + "epoch": 7.847451411807848, + "grad_norm": 1.2421947717666626, + "learning_rate": 0.0002, + "loss": 0.3599, + "step": 10700 + }, + { + "epoch": 7.854785478547855, + "grad_norm": 1.0233848094940186, + "learning_rate": 0.0002, + "loss": 0.3829, + "step": 10710 + }, + { + "epoch": 7.862119545287862, + "grad_norm": 1.640842080116272, + "learning_rate": 0.0002, + "loss": 0.4265, + "step": 10720 + }, + { + "epoch": 7.869453612027869, + "grad_norm": 1.3571926355361938, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 10730 + }, + { + "epoch": 7.876787678767877, + "grad_norm": 1.459564208984375, + "learning_rate": 0.0002, + "loss": 0.4454, + "step": 10740 + }, + { + "epoch": 7.884121745507884, + "grad_norm": 0.9202831387519836, + "learning_rate": 0.0002, + "loss": 0.3903, + "step": 10750 + }, + { + "epoch": 7.891455812247892, + "grad_norm": 1.3509176969528198, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 10760 + }, + { + "epoch": 7.898789878987898, + "grad_norm": 1.5858603715896606, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 10770 + }, + { + "epoch": 7.906123945727906, + "grad_norm": 1.2391952276229858, + "learning_rate": 0.0002, + "loss": 0.3753, + "step": 10780 + }, + { + "epoch": 7.913458012467913, + "grad_norm": 1.3442552089691162, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 10790 + }, + { + "epoch": 7.920792079207921, + "grad_norm": 1.7327884435653687, + "learning_rate": 0.0002, + "loss": 0.4377, + "step": 10800 + }, + { + "epoch": 7.928126145947928, + "grad_norm": 1.4246922731399536, + "learning_rate": 0.0002, + "loss": 0.376, + "step": 10810 + }, + { + "epoch": 7.935460212687936, + "grad_norm": 1.4421411752700806, + "learning_rate": 0.0002, + "loss": 0.4158, + "step": 10820 + }, + { + "epoch": 7.942794279427943, + "grad_norm": 1.3445014953613281, + "learning_rate": 0.0002, + "loss": 0.4084, + "step": 10830 + }, + { + "epoch": 7.950128346167951, + "grad_norm": 1.2219295501708984, + "learning_rate": 0.0002, + "loss": 0.3986, + "step": 10840 + }, + { + "epoch": 7.957462412907957, + "grad_norm": 1.241843342781067, + "learning_rate": 0.0002, + "loss": 0.428, + "step": 10850 + }, + { + "epoch": 7.964796479647965, + "grad_norm": 0.9814007878303528, + "learning_rate": 0.0002, + "loss": 0.3776, + "step": 10860 + }, + { + "epoch": 7.972130546387972, + "grad_norm": 1.4015462398529053, + "learning_rate": 0.0002, + "loss": 0.4866, + "step": 10870 + }, + { + "epoch": 7.97946461312798, + "grad_norm": 1.4638406038284302, + "learning_rate": 0.0002, + "loss": 0.437, + "step": 10880 + }, + { + "epoch": 7.986798679867987, + "grad_norm": 1.585194706916809, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 10890 + }, + { + "epoch": 7.994132746607994, + "grad_norm": 1.197031855583191, + "learning_rate": 0.0002, + "loss": 0.3969, + "step": 10900 + }, + { + "epoch": 7.997066373303997, + "eval_loss": 1.499578595161438, + "eval_runtime": 32.7458, + "eval_samples_per_second": 13.162, + "eval_steps_per_second": 1.649, + "step": 10904 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.0461270495513805e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-10904/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c36ecbbb47676521c72eea7028470f3e1d4b672 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb3e68c859e7f949d61c2b851f8413912f0844f23f0d7451d5d00de039c04417 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..830ccdecc75c527c4aae4cce853e39ceda3b15f1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8db681f7fd9dfa85d55512999836db447ef2a457bd7b27f0648bbbc56412b607 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..70edec11e7459d3c7aa5439be9a2b2056c65a9ad --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e8c1f88129afe081291492ecda26f8bde107365cff0da0a4ab1c6f1ccb89efa +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..381516e5ded3f51b9d3cf89c73c0cdf6a851cc0d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec792ccb2845954d3473c14cd59e6ae800f3539bd2b48bbf23c7aa849803b8fa +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a565e5777c2ddd8662b5f94ca0ccbbcccf6f4366 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/trainer_state.json @@ -0,0 +1,993 @@ +{ + "best_metric": 1.1822267770767212, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363", + "epoch": 0.9996332966629996, + "eval_steps": 10, + "global_step": 1363, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.309972699984691e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec1ef8d47b111af6b72ef86a5e420ba55ab736f6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18d3065462d28b00e94897958b444dda35b3dd73c28df887bc7c6cfe0ea65800 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa3b50e0d499bce41ed2192265ed261d64261b2f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3192e65fe12343970eafb01e10d7a8147b440f12a84b3ec976ed4c80429062b5 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..76249ea6e890a3dc0c443940bcaaa512d966c49e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc6c2826db31d92be03894f62dd34cd3704f8d386851d4da63f91a3f2227279f +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f5c8f6520190e4e417e0a861746661e0e651c80 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b53869add167718298cc75c10e199544e4e266f7a2d65e010562c09bd1ea51d7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9b431e9b5c4a3fff5d9580eaa26b9cdf08e2c648 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/trainer_state.json @@ -0,0 +1,1953 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2727, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2619945399969382e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5682d88ee2c00657eba0aa767d87c4044865cd38 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef302d6be463e56e29f4b08958ebf19ddc0559b0548528c413818f5e2c5a33cc +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fb0b79ff10e1abd1e4528cbccf466e58c08eafd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7527c1f97e0d3f127cfe7f87b8021637522d12b2548963523706830f16c9912e +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6442894d3deec9f7988bcb5bdf90a11d8f0f19f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b56c7af373f43eefb93705238cb39dbcfbe71e194b625d87c287dda9ba71d602 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a1e4ab150dec5b8006a44a3808cfd38f38bee8c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8935eb574027b699cfe689b5551ebf631eefee73b80f79698d1ad69b75c19ff0 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a64707a750ae30fed316e5ed852d68a0db13eeb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/trainer_state.json @@ -0,0 +1,2920 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 2.9996332966629997, + "eval_steps": 10, + "global_step": 4090, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.48628315329551697, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 2730 + }, + { + "epoch": 2.0095342867620096, + "grad_norm": 0.413875013589859, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 2740 + }, + { + "epoch": 2.0168683535020167, + "grad_norm": 0.4988735616207123, + "learning_rate": 0.0002, + "loss": 0.965, + "step": 2750 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.5634812712669373, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 2760 + }, + { + "epoch": 2.0315364869820316, + "grad_norm": 0.48302653431892395, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 2770 + }, + { + "epoch": 2.038870553722039, + "grad_norm": 0.49914175271987915, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 2780 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 1.14039945602417, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2790 + }, + { + "epoch": 2.0535386872020536, + "grad_norm": 0.6359720826148987, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 2800 + }, + { + "epoch": 2.060872753942061, + "grad_norm": 0.4589158296585083, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 2810 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.46255481243133545, + "learning_rate": 0.0002, + "loss": 0.9438, + "step": 2820 + }, + { + "epoch": 2.0755408874220755, + "grad_norm": 0.6232137680053711, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 2830 + }, + { + "epoch": 2.082874954162083, + "grad_norm": 0.41042178869247437, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 2840 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.5334428548812866, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2850 + }, + { + "epoch": 2.0975430876420975, + "grad_norm": 0.8270058631896973, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 2860 + }, + { + "epoch": 2.104877154382105, + "grad_norm": 0.6624533534049988, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 2870 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 2880 + }, + { + "epoch": 2.1195452878621195, + "grad_norm": 0.621482789516449, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 2890 + }, + { + "epoch": 2.126879354602127, + "grad_norm": 0.4556255340576172, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2900 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 0.4620579183101654, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 2910 + }, + { + "epoch": 2.1415474880821415, + "grad_norm": 0.9602415561676025, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2920 + }, + { + "epoch": 2.148881554822149, + "grad_norm": 0.587943971157074, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2930 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 0.5121372938156128, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 2940 + }, + { + "epoch": 2.1635496883021634, + "grad_norm": 0.49424484372138977, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 2950 + }, + { + "epoch": 2.170883755042171, + "grad_norm": 0.6312560439109802, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2960 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.5235576629638672, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2970 + }, + { + "epoch": 2.1855518885221854, + "grad_norm": 0.5868439674377441, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 2980 + }, + { + "epoch": 2.192885955262193, + "grad_norm": 0.42302873730659485, + "learning_rate": 0.0002, + "loss": 0.9338, + "step": 2990 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.5097725987434387, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 3000 + }, + { + "epoch": 2.2075540887422074, + "grad_norm": 0.5091572403907776, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 3010 + }, + { + "epoch": 2.214888155482215, + "grad_norm": 0.49433162808418274, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 3020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5577368140220642, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 3030 + }, + { + "epoch": 2.2295562889622293, + "grad_norm": 0.6177583932876587, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 3040 + }, + { + "epoch": 2.236890355702237, + "grad_norm": 0.5256719589233398, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 3050 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.5001118183135986, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3060 + }, + { + "epoch": 2.2515584891822513, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3070 + }, + { + "epoch": 2.258892555922259, + "grad_norm": 0.5325384140014648, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3080 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.5719189047813416, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 2.2735606894022737, + "grad_norm": 0.6337835788726807, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 3100 + }, + { + "epoch": 2.2808947561422808, + "grad_norm": 0.5381836891174316, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 3110 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.5408531427383423, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 3120 + }, + { + "epoch": 2.2955628896222957, + "grad_norm": 0.43705281615257263, + "learning_rate": 0.0002, + "loss": 1.0325, + "step": 3130 + }, + { + "epoch": 2.3028969563623027, + "grad_norm": 0.6454030275344849, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 3140 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.686030387878418, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 3150 + }, + { + "epoch": 2.3175650898423177, + "grad_norm": 0.5123633146286011, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 3160 + }, + { + "epoch": 2.3248991565823247, + "grad_norm": 0.842506468296051, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 3170 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.5193818807601929, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 2.3395672900623397, + "grad_norm": 0.5634409189224243, + "learning_rate": 0.0002, + "loss": 0.9473, + "step": 3190 + }, + { + "epoch": 2.3469013568023467, + "grad_norm": 0.6475534439086914, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 3200 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.1503914594650269, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 2.3615694902823616, + "grad_norm": 0.7234905362129211, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 3220 + }, + { + "epoch": 2.368903557022369, + "grad_norm": 0.664903461933136, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 3230 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.5453006625175476, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 3240 + }, + { + "epoch": 2.3835716905023836, + "grad_norm": 0.6256654262542725, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 3250 + }, + { + "epoch": 2.390905757242391, + "grad_norm": 0.5166565179824829, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.5699098110198975, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 3270 + }, + { + "epoch": 2.4055738907224056, + "grad_norm": 0.4472540020942688, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 3280 + }, + { + "epoch": 2.412907957462413, + "grad_norm": 0.6790403127670288, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3290 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 0.5182185173034668, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 3300 + }, + { + "epoch": 2.4275760909424275, + "grad_norm": 0.564647912979126, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 3310 + }, + { + "epoch": 2.434910157682435, + "grad_norm": 0.5625313520431519, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 3320 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.7496559619903564, + "learning_rate": 0.0002, + "loss": 0.8798, + "step": 3330 + }, + { + "epoch": 2.4495782911624495, + "grad_norm": 0.4779128134250641, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 2.456912357902457, + "grad_norm": 0.578093409538269, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3350 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.5456080436706543, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 3360 + }, + { + "epoch": 2.4715804913824715, + "grad_norm": 0.4769273102283478, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 3370 + }, + { + "epoch": 2.478914558122479, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0002, + "loss": 0.9312, + "step": 3380 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.5590165853500366, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3390 + }, + { + "epoch": 2.4935826916024935, + "grad_norm": 0.801306962966919, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 3400 + }, + { + "epoch": 2.500916758342501, + "grad_norm": 0.6045624613761902, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 3410 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.5735858082771301, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 3420 + }, + { + "epoch": 2.5155848918225154, + "grad_norm": 0.6827309131622314, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 3430 + }, + { + "epoch": 2.522918958562523, + "grad_norm": 0.5702602863311768, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 3440 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.6674721240997314, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 3450 + }, + { + "epoch": 2.5375870920425374, + "grad_norm": 0.5635907649993896, + "learning_rate": 0.0002, + "loss": 0.914, + "step": 3460 + }, + { + "epoch": 2.544921158782545, + "grad_norm": 0.42737770080566406, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 3470 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.6720691919326782, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 2.55958929226256, + "grad_norm": 0.8917084336280823, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 3490 + }, + { + "epoch": 2.566923359002567, + "grad_norm": 0.5134549140930176, + "learning_rate": 0.0002, + "loss": 0.9257, + "step": 3500 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.4951367974281311, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 3510 + }, + { + "epoch": 2.5815914924825814, + "grad_norm": 0.9438204765319824, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 3520 + }, + { + "epoch": 2.588925559222589, + "grad_norm": 0.6024714708328247, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 3530 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.5248535871505737, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 3540 + }, + { + "epoch": 2.6035936927026038, + "grad_norm": 0.8677568435668945, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 3550 + }, + { + "epoch": 2.610927759442611, + "grad_norm": 0.82008296251297, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 3560 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.4724634885787964, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 3570 + }, + { + "epoch": 2.6255958929226257, + "grad_norm": 0.5434244275093079, + "learning_rate": 0.0002, + "loss": 0.9058, + "step": 3580 + }, + { + "epoch": 2.6329299596626328, + "grad_norm": 0.4948740005493164, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 3590 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.42109328508377075, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3600 + }, + { + "epoch": 2.6475980931426477, + "grad_norm": 0.7979786396026611, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 3610 + }, + { + "epoch": 2.654932159882655, + "grad_norm": 0.6345919370651245, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 3620 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.4971671402454376, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 3630 + }, + { + "epoch": 2.6696002933626697, + "grad_norm": 0.6467748284339905, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 3640 + }, + { + "epoch": 2.6769343601026767, + "grad_norm": 0.4240160286426544, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 3650 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.5179754495620728, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3660 + }, + { + "epoch": 2.6916024935826917, + "grad_norm": 0.754012405872345, + "learning_rate": 0.0002, + "loss": 0.9221, + "step": 3670 + }, + { + "epoch": 2.698936560322699, + "grad_norm": 0.5141299962997437, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3680 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.5737819075584412, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 3690 + }, + { + "epoch": 2.7136046938027136, + "grad_norm": 0.5887577533721924, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 3700 + }, + { + "epoch": 2.720938760542721, + "grad_norm": 0.6740471720695496, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 3710 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 3720 + }, + { + "epoch": 2.7356068940227356, + "grad_norm": 0.4858354926109314, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 3730 + }, + { + "epoch": 2.742940960762743, + "grad_norm": 0.5489001870155334, + "learning_rate": 0.0002, + "loss": 0.9308, + "step": 3740 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.8187092542648315, + "learning_rate": 0.0002, + "loss": 0.894, + "step": 3750 + }, + { + "epoch": 2.7576090942427576, + "grad_norm": 0.5666626691818237, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 3760 + }, + { + "epoch": 2.764943160982765, + "grad_norm": 0.5377066135406494, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 3770 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.566330075263977, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 3780 + }, + { + "epoch": 2.7796112944627795, + "grad_norm": 0.5522832870483398, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 3790 + }, + { + "epoch": 2.786945361202787, + "grad_norm": 0.5668695569038391, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3800 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.7566602826118469, + "learning_rate": 0.0002, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 2.8016134946828015, + "grad_norm": 0.5603684782981873, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 3820 + }, + { + "epoch": 2.808947561422809, + "grad_norm": 0.49122217297554016, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.6798251867294312, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 3840 + }, + { + "epoch": 2.8236156949028235, + "grad_norm": 0.6097991466522217, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 3850 + }, + { + "epoch": 2.830949761642831, + "grad_norm": 0.6675726175308228, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 3860 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.9223952889442444, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 3870 + }, + { + "epoch": 2.8456178951228455, + "grad_norm": 0.6020799875259399, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 3880 + }, + { + "epoch": 2.852951961862853, + "grad_norm": 0.5206381678581238, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3890 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.6268777251243591, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 3900 + }, + { + "epoch": 2.8676200953428674, + "grad_norm": 1.1583497524261475, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 3910 + }, + { + "epoch": 2.874954162082875, + "grad_norm": 0.7263903021812439, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 3920 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.5369910001754761, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 3930 + }, + { + "epoch": 2.88962229556289, + "grad_norm": 0.7298350930213928, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 3940 + }, + { + "epoch": 2.896956362302897, + "grad_norm": 0.577012836933136, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.5859594345092773, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 3960 + }, + { + "epoch": 2.9116244957829114, + "grad_norm": 0.47176122665405273, + "learning_rate": 0.0002, + "loss": 0.8772, + "step": 3970 + }, + { + "epoch": 2.918958562522919, + "grad_norm": 0.9699620604515076, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 3980 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.7908747792243958, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3990 + }, + { + "epoch": 2.933626696002934, + "grad_norm": 0.5777379274368286, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 4000 + }, + { + "epoch": 2.940960762742941, + "grad_norm": 0.599288284778595, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 4010 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.5232274532318115, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 4020 + }, + { + "epoch": 2.9556288962229558, + "grad_norm": 0.6395137310028076, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 4030 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.589260458946228, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 4040 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5699581503868103, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 4050 + }, + { + "epoch": 2.9776310964429777, + "grad_norm": 0.528468132019043, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 4060 + }, + { + "epoch": 2.984965163182985, + "grad_norm": 0.4804670512676239, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4070 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 1.1918889284133911, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 4080 + }, + { + "epoch": 2.9996332966629997, + "grad_norm": 0.5479103326797485, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 4090 + }, + { + "epoch": 2.9996332966629997, + "eval_loss": 1.1642853021621704, + "eval_runtime": 32.7511, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.649, + "step": 4090 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8929918099954074e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-4090/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2f27193d2800afa1ecf38d32ca029cca4c48153 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783240254d4d99f2734879322ca67b130ada972da1875179a88b93095caa5027 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7a7b012ba82f504ba58834e0666e367cecb43d3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aac35d75c36594bce550fceee036251df5ff665be30b898e7461e6cf74d4738b +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f30acc8ccb3de502804392f9179ca54bf8fbaf6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3267b86fe975e9097bd132b8cbf8c3b6f2dc70ad372771b130a4b0a82c1b248d +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..85e5a20c021880a6fb5c9efb5621de4c963b391f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e6b8312521b90d16ac6a6a64f9ca779b0501086d106cb4f9b043c7499d67c8 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c251e2b2b002f82fe3e94a1e003a0f7cabe5e71d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/trainer_state.json @@ -0,0 +1,3880 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5454, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.48628315329551697, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 2730 + }, + { + "epoch": 2.0095342867620096, + "grad_norm": 0.413875013589859, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 2740 + }, + { + "epoch": 2.0168683535020167, + "grad_norm": 0.4988735616207123, + "learning_rate": 0.0002, + "loss": 0.965, + "step": 2750 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.5634812712669373, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 2760 + }, + { + "epoch": 2.0315364869820316, + "grad_norm": 0.48302653431892395, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 2770 + }, + { + "epoch": 2.038870553722039, + "grad_norm": 0.49914175271987915, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 2780 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 1.14039945602417, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2790 + }, + { + "epoch": 2.0535386872020536, + "grad_norm": 0.6359720826148987, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 2800 + }, + { + "epoch": 2.060872753942061, + "grad_norm": 0.4589158296585083, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 2810 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.46255481243133545, + "learning_rate": 0.0002, + "loss": 0.9438, + "step": 2820 + }, + { + "epoch": 2.0755408874220755, + "grad_norm": 0.6232137680053711, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 2830 + }, + { + "epoch": 2.082874954162083, + "grad_norm": 0.41042178869247437, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 2840 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.5334428548812866, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2850 + }, + { + "epoch": 2.0975430876420975, + "grad_norm": 0.8270058631896973, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 2860 + }, + { + "epoch": 2.104877154382105, + "grad_norm": 0.6624533534049988, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 2870 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 2880 + }, + { + "epoch": 2.1195452878621195, + "grad_norm": 0.621482789516449, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 2890 + }, + { + "epoch": 2.126879354602127, + "grad_norm": 0.4556255340576172, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2900 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 0.4620579183101654, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 2910 + }, + { + "epoch": 2.1415474880821415, + "grad_norm": 0.9602415561676025, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2920 + }, + { + "epoch": 2.148881554822149, + "grad_norm": 0.587943971157074, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2930 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 0.5121372938156128, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 2940 + }, + { + "epoch": 2.1635496883021634, + "grad_norm": 0.49424484372138977, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 2950 + }, + { + "epoch": 2.170883755042171, + "grad_norm": 0.6312560439109802, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2960 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.5235576629638672, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2970 + }, + { + "epoch": 2.1855518885221854, + "grad_norm": 0.5868439674377441, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 2980 + }, + { + "epoch": 2.192885955262193, + "grad_norm": 0.42302873730659485, + "learning_rate": 0.0002, + "loss": 0.9338, + "step": 2990 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.5097725987434387, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 3000 + }, + { + "epoch": 2.2075540887422074, + "grad_norm": 0.5091572403907776, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 3010 + }, + { + "epoch": 2.214888155482215, + "grad_norm": 0.49433162808418274, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 3020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5577368140220642, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 3030 + }, + { + "epoch": 2.2295562889622293, + "grad_norm": 0.6177583932876587, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 3040 + }, + { + "epoch": 2.236890355702237, + "grad_norm": 0.5256719589233398, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 3050 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.5001118183135986, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3060 + }, + { + "epoch": 2.2515584891822513, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3070 + }, + { + "epoch": 2.258892555922259, + "grad_norm": 0.5325384140014648, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3080 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.5719189047813416, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 2.2735606894022737, + "grad_norm": 0.6337835788726807, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 3100 + }, + { + "epoch": 2.2808947561422808, + "grad_norm": 0.5381836891174316, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 3110 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.5408531427383423, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 3120 + }, + { + "epoch": 2.2955628896222957, + "grad_norm": 0.43705281615257263, + "learning_rate": 0.0002, + "loss": 1.0325, + "step": 3130 + }, + { + "epoch": 2.3028969563623027, + "grad_norm": 0.6454030275344849, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 3140 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.686030387878418, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 3150 + }, + { + "epoch": 2.3175650898423177, + "grad_norm": 0.5123633146286011, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 3160 + }, + { + "epoch": 2.3248991565823247, + "grad_norm": 0.842506468296051, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 3170 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.5193818807601929, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 2.3395672900623397, + "grad_norm": 0.5634409189224243, + "learning_rate": 0.0002, + "loss": 0.9473, + "step": 3190 + }, + { + "epoch": 2.3469013568023467, + "grad_norm": 0.6475534439086914, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 3200 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.1503914594650269, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 2.3615694902823616, + "grad_norm": 0.7234905362129211, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 3220 + }, + { + "epoch": 2.368903557022369, + "grad_norm": 0.664903461933136, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 3230 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.5453006625175476, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 3240 + }, + { + "epoch": 2.3835716905023836, + "grad_norm": 0.6256654262542725, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 3250 + }, + { + "epoch": 2.390905757242391, + "grad_norm": 0.5166565179824829, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.5699098110198975, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 3270 + }, + { + "epoch": 2.4055738907224056, + "grad_norm": 0.4472540020942688, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 3280 + }, + { + "epoch": 2.412907957462413, + "grad_norm": 0.6790403127670288, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3290 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 0.5182185173034668, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 3300 + }, + { + "epoch": 2.4275760909424275, + "grad_norm": 0.564647912979126, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 3310 + }, + { + "epoch": 2.434910157682435, + "grad_norm": 0.5625313520431519, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 3320 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.7496559619903564, + "learning_rate": 0.0002, + "loss": 0.8798, + "step": 3330 + }, + { + "epoch": 2.4495782911624495, + "grad_norm": 0.4779128134250641, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 2.456912357902457, + "grad_norm": 0.578093409538269, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3350 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.5456080436706543, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 3360 + }, + { + "epoch": 2.4715804913824715, + "grad_norm": 0.4769273102283478, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 3370 + }, + { + "epoch": 2.478914558122479, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0002, + "loss": 0.9312, + "step": 3380 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.5590165853500366, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3390 + }, + { + "epoch": 2.4935826916024935, + "grad_norm": 0.801306962966919, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 3400 + }, + { + "epoch": 2.500916758342501, + "grad_norm": 0.6045624613761902, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 3410 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.5735858082771301, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 3420 + }, + { + "epoch": 2.5155848918225154, + "grad_norm": 0.6827309131622314, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 3430 + }, + { + "epoch": 2.522918958562523, + "grad_norm": 0.5702602863311768, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 3440 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.6674721240997314, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 3450 + }, + { + "epoch": 2.5375870920425374, + "grad_norm": 0.5635907649993896, + "learning_rate": 0.0002, + "loss": 0.914, + "step": 3460 + }, + { + "epoch": 2.544921158782545, + "grad_norm": 0.42737770080566406, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 3470 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.6720691919326782, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 2.55958929226256, + "grad_norm": 0.8917084336280823, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 3490 + }, + { + "epoch": 2.566923359002567, + "grad_norm": 0.5134549140930176, + "learning_rate": 0.0002, + "loss": 0.9257, + "step": 3500 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.4951367974281311, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 3510 + }, + { + "epoch": 2.5815914924825814, + "grad_norm": 0.9438204765319824, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 3520 + }, + { + "epoch": 2.588925559222589, + "grad_norm": 0.6024714708328247, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 3530 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.5248535871505737, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 3540 + }, + { + "epoch": 2.6035936927026038, + "grad_norm": 0.8677568435668945, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 3550 + }, + { + "epoch": 2.610927759442611, + "grad_norm": 0.82008296251297, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 3560 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.4724634885787964, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 3570 + }, + { + "epoch": 2.6255958929226257, + "grad_norm": 0.5434244275093079, + "learning_rate": 0.0002, + "loss": 0.9058, + "step": 3580 + }, + { + "epoch": 2.6329299596626328, + "grad_norm": 0.4948740005493164, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 3590 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.42109328508377075, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3600 + }, + { + "epoch": 2.6475980931426477, + "grad_norm": 0.7979786396026611, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 3610 + }, + { + "epoch": 2.654932159882655, + "grad_norm": 0.6345919370651245, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 3620 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.4971671402454376, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 3630 + }, + { + "epoch": 2.6696002933626697, + "grad_norm": 0.6467748284339905, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 3640 + }, + { + "epoch": 2.6769343601026767, + "grad_norm": 0.4240160286426544, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 3650 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.5179754495620728, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3660 + }, + { + "epoch": 2.6916024935826917, + "grad_norm": 0.754012405872345, + "learning_rate": 0.0002, + "loss": 0.9221, + "step": 3670 + }, + { + "epoch": 2.698936560322699, + "grad_norm": 0.5141299962997437, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3680 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.5737819075584412, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 3690 + }, + { + "epoch": 2.7136046938027136, + "grad_norm": 0.5887577533721924, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 3700 + }, + { + "epoch": 2.720938760542721, + "grad_norm": 0.6740471720695496, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 3710 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 3720 + }, + { + "epoch": 2.7356068940227356, + "grad_norm": 0.4858354926109314, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 3730 + }, + { + "epoch": 2.742940960762743, + "grad_norm": 0.5489001870155334, + "learning_rate": 0.0002, + "loss": 0.9308, + "step": 3740 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.8187092542648315, + "learning_rate": 0.0002, + "loss": 0.894, + "step": 3750 + }, + { + "epoch": 2.7576090942427576, + "grad_norm": 0.5666626691818237, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 3760 + }, + { + "epoch": 2.764943160982765, + "grad_norm": 0.5377066135406494, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 3770 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.566330075263977, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 3780 + }, + { + "epoch": 2.7796112944627795, + "grad_norm": 0.5522832870483398, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 3790 + }, + { + "epoch": 2.786945361202787, + "grad_norm": 0.5668695569038391, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3800 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.7566602826118469, + "learning_rate": 0.0002, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 2.8016134946828015, + "grad_norm": 0.5603684782981873, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 3820 + }, + { + "epoch": 2.808947561422809, + "grad_norm": 0.49122217297554016, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.6798251867294312, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 3840 + }, + { + "epoch": 2.8236156949028235, + "grad_norm": 0.6097991466522217, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 3850 + }, + { + "epoch": 2.830949761642831, + "grad_norm": 0.6675726175308228, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 3860 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.9223952889442444, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 3870 + }, + { + "epoch": 2.8456178951228455, + "grad_norm": 0.6020799875259399, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 3880 + }, + { + "epoch": 2.852951961862853, + "grad_norm": 0.5206381678581238, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3890 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.6268777251243591, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 3900 + }, + { + "epoch": 2.8676200953428674, + "grad_norm": 1.1583497524261475, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 3910 + }, + { + "epoch": 2.874954162082875, + "grad_norm": 0.7263903021812439, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 3920 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.5369910001754761, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 3930 + }, + { + "epoch": 2.88962229556289, + "grad_norm": 0.7298350930213928, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 3940 + }, + { + "epoch": 2.896956362302897, + "grad_norm": 0.577012836933136, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.5859594345092773, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 3960 + }, + { + "epoch": 2.9116244957829114, + "grad_norm": 0.47176122665405273, + "learning_rate": 0.0002, + "loss": 0.8772, + "step": 3970 + }, + { + "epoch": 2.918958562522919, + "grad_norm": 0.9699620604515076, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 3980 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.7908747792243958, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3990 + }, + { + "epoch": 2.933626696002934, + "grad_norm": 0.5777379274368286, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 4000 + }, + { + "epoch": 2.940960762742941, + "grad_norm": 0.599288284778595, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 4010 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.5232274532318115, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 4020 + }, + { + "epoch": 2.9556288962229558, + "grad_norm": 0.6395137310028076, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 4030 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.589260458946228, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 4040 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5699581503868103, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 4050 + }, + { + "epoch": 2.9776310964429777, + "grad_norm": 0.528468132019043, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 4060 + }, + { + "epoch": 2.984965163182985, + "grad_norm": 0.4804670512676239, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4070 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 1.1918889284133911, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 4080 + }, + { + "epoch": 2.9996332966629997, + "grad_norm": 0.5479103326797485, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 4090 + }, + { + "epoch": 2.9996332966629997, + "eval_loss": 1.1642853021621704, + "eval_runtime": 32.7511, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.649, + "step": 4090 + }, + { + "epoch": 3.006967363403007, + "grad_norm": 0.7430027723312378, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 4100 + }, + { + "epoch": 3.014301430143014, + "grad_norm": 0.6293647289276123, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4110 + }, + { + "epoch": 3.0216354968830217, + "grad_norm": 0.6191329956054688, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 4120 + }, + { + "epoch": 3.028969563623029, + "grad_norm": 0.7959313988685608, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4130 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 0.5956351161003113, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 4140 + }, + { + "epoch": 3.0436376971030437, + "grad_norm": 0.670383632183075, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 4150 + }, + { + "epoch": 3.050971763843051, + "grad_norm": 0.6414518356323242, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 4160 + }, + { + "epoch": 3.058305830583058, + "grad_norm": 0.7928852438926697, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 4170 + }, + { + "epoch": 3.0656398973230656, + "grad_norm": 0.6211121082305908, + "learning_rate": 0.0002, + "loss": 0.7914, + "step": 4180 + }, + { + "epoch": 3.072973964063073, + "grad_norm": 0.6237057447433472, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 4190 + }, + { + "epoch": 3.08030803080308, + "grad_norm": 0.6522233486175537, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 4200 + }, + { + "epoch": 3.0876420975430876, + "grad_norm": 0.9396848678588867, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4210 + }, + { + "epoch": 3.094976164283095, + "grad_norm": 0.8003010749816895, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 4220 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.6733810305595398, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 4230 + }, + { + "epoch": 3.1096442977631096, + "grad_norm": 0.6365828514099121, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 4240 + }, + { + "epoch": 3.116978364503117, + "grad_norm": 1.0805548429489136, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4250 + }, + { + "epoch": 3.1243124312431245, + "grad_norm": 0.7262141108512878, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4260 + }, + { + "epoch": 3.1316464979831315, + "grad_norm": 0.5500539541244507, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 4270 + }, + { + "epoch": 3.138980564723139, + "grad_norm": 0.793912947177887, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 4280 + }, + { + "epoch": 3.1463146314631465, + "grad_norm": 1.2540518045425415, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 4290 + }, + { + "epoch": 3.1536486982031535, + "grad_norm": 0.7020077705383301, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 4300 + }, + { + "epoch": 3.160982764943161, + "grad_norm": 0.5111123323440552, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 4310 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.7172090411186218, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 4320 + }, + { + "epoch": 3.1756508984231755, + "grad_norm": 0.6343168616294861, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 4330 + }, + { + "epoch": 3.182984965163183, + "grad_norm": 0.9563672542572021, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4340 + }, + { + "epoch": 3.1903190319031904, + "grad_norm": 1.0225574970245361, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4350 + }, + { + "epoch": 3.1976530986431975, + "grad_norm": 1.1633386611938477, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 4360 + }, + { + "epoch": 3.204987165383205, + "grad_norm": 0.8915148973464966, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 4370 + }, + { + "epoch": 3.2123212321232124, + "grad_norm": 0.9156812429428101, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4380 + }, + { + "epoch": 3.21965529886322, + "grad_norm": 0.6363258957862854, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 4390 + }, + { + "epoch": 3.226989365603227, + "grad_norm": 0.579099178314209, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 4400 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 0.8778146505355835, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 4410 + }, + { + "epoch": 3.241657499083242, + "grad_norm": 0.8356770873069763, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 4420 + }, + { + "epoch": 3.248991565823249, + "grad_norm": 0.702032208442688, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 4430 + }, + { + "epoch": 3.2563256325632564, + "grad_norm": 0.6386539340019226, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 4440 + }, + { + "epoch": 3.263659699303264, + "grad_norm": 0.7008408904075623, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 4450 + }, + { + "epoch": 3.270993766043271, + "grad_norm": 0.9556332230567932, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 4460 + }, + { + "epoch": 3.2783278327832783, + "grad_norm": 0.5667835474014282, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 4470 + }, + { + "epoch": 3.285661899523286, + "grad_norm": 0.8239172697067261, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 4480 + }, + { + "epoch": 3.292995966263293, + "grad_norm": 0.7045050859451294, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 4490 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.7131434082984924, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 4500 + }, + { + "epoch": 3.3076640997433078, + "grad_norm": 0.6924910545349121, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 4510 + }, + { + "epoch": 3.3149981664833152, + "grad_norm": 0.8945356607437134, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 4520 + }, + { + "epoch": 3.3223322332233223, + "grad_norm": 0.6546903252601624, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 4530 + }, + { + "epoch": 3.3296662999633297, + "grad_norm": 0.8206679224967957, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4540 + }, + { + "epoch": 3.3370003667033368, + "grad_norm": 0.6482203602790833, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 4550 + }, + { + "epoch": 3.3443344334433442, + "grad_norm": 0.7558760046958923, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 4560 + }, + { + "epoch": 3.3516685001833517, + "grad_norm": 0.7794756889343262, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 4570 + }, + { + "epoch": 3.359002566923359, + "grad_norm": 0.7382805943489075, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4580 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.5912511944770813, + "learning_rate": 0.0002, + "loss": 0.8511, + "step": 4590 + }, + { + "epoch": 3.3736707004033737, + "grad_norm": 0.7444885969161987, + "learning_rate": 0.0002, + "loss": 0.8272, + "step": 4600 + }, + { + "epoch": 3.381004767143381, + "grad_norm": 0.7354922890663147, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 4610 + }, + { + "epoch": 3.388338833883388, + "grad_norm": 0.7685934901237488, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 4620 + }, + { + "epoch": 3.3956729006233957, + "grad_norm": 0.61041259765625, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 4630 + }, + { + "epoch": 3.403006967363403, + "grad_norm": 0.6820451021194458, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 4640 + }, + { + "epoch": 3.41034103410341, + "grad_norm": 0.5819534063339233, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 4650 + }, + { + "epoch": 3.4176751008434176, + "grad_norm": 0.705410897731781, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 4660 + }, + { + "epoch": 3.425009167583425, + "grad_norm": 0.8052892088890076, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 4670 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.7746483087539673, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 4680 + }, + { + "epoch": 3.4396773010634396, + "grad_norm": 0.7713689804077148, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 4690 + }, + { + "epoch": 3.447011367803447, + "grad_norm": 0.810371994972229, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 4700 + }, + { + "epoch": 3.4543454345434546, + "grad_norm": 0.7702969312667847, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4710 + }, + { + "epoch": 3.4616795012834616, + "grad_norm": 0.7069268822669983, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4720 + }, + { + "epoch": 3.469013568023469, + "grad_norm": 0.7640359401702881, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 4730 + }, + { + "epoch": 3.4763476347634765, + "grad_norm": 0.8661707639694214, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 4740 + }, + { + "epoch": 3.4836817015034836, + "grad_norm": 0.9970282912254333, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 4750 + }, + { + "epoch": 3.491015768243491, + "grad_norm": 0.5824355483055115, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 4760 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 1.3072649240493774, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 4770 + }, + { + "epoch": 3.5056839017235055, + "grad_norm": 0.873978316783905, + "learning_rate": 0.0002, + "loss": 0.9101, + "step": 4780 + }, + { + "epoch": 3.513017968463513, + "grad_norm": 0.5526657104492188, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4790 + }, + { + "epoch": 3.5203520352035205, + "grad_norm": 0.790894627571106, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 4800 + }, + { + "epoch": 3.5276861019435275, + "grad_norm": 0.8119630217552185, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 4810 + }, + { + "epoch": 3.535020168683535, + "grad_norm": 0.633212149143219, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 4820 + }, + { + "epoch": 3.5423542354235424, + "grad_norm": 0.703029990196228, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 4830 + }, + { + "epoch": 3.54968830216355, + "grad_norm": 0.7603771686553955, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 4840 + }, + { + "epoch": 3.557022368903557, + "grad_norm": 0.6260480880737305, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 4850 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.8203664422035217, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 4860 + }, + { + "epoch": 3.5716905023835714, + "grad_norm": 0.7793813347816467, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 4870 + }, + { + "epoch": 3.579024569123579, + "grad_norm": 0.7667397260665894, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 4880 + }, + { + "epoch": 3.5863586358635864, + "grad_norm": 0.8198829889297485, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 4890 + }, + { + "epoch": 3.593692702603594, + "grad_norm": 0.7689233422279358, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 4900 + }, + { + "epoch": 3.601026769343601, + "grad_norm": 0.7870983481407166, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 4910 + }, + { + "epoch": 3.6083608360836084, + "grad_norm": 0.8133853077888489, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 4920 + }, + { + "epoch": 3.615694902823616, + "grad_norm": 1.308401346206665, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4930 + }, + { + "epoch": 3.623028969563623, + "grad_norm": 0.7131121754646301, + "learning_rate": 0.0002, + "loss": 0.8494, + "step": 4940 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.6825910210609436, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 4950 + }, + { + "epoch": 3.637697103043638, + "grad_norm": 0.7254678606987, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4960 + }, + { + "epoch": 3.6450311697836453, + "grad_norm": 0.8045085072517395, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4970 + }, + { + "epoch": 3.6523652365236523, + "grad_norm": 0.6991777420043945, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 4980 + }, + { + "epoch": 3.6596993032636598, + "grad_norm": 0.7804713249206543, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 4990 + }, + { + "epoch": 3.667033370003667, + "grad_norm": 0.8525708317756653, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 5000 + }, + { + "epoch": 3.6743674367436743, + "grad_norm": 0.7959994673728943, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 5010 + }, + { + "epoch": 3.6817015034836817, + "grad_norm": 0.8103628158569336, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5020 + }, + { + "epoch": 3.689035570223689, + "grad_norm": 0.7517836093902588, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 5030 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.6878514289855957, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 5040 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 1.2371820211410522, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 5050 + }, + { + "epoch": 3.711037770443711, + "grad_norm": 0.6567103862762451, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 5060 + }, + { + "epoch": 3.718371837183718, + "grad_norm": 1.1254922151565552, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 5070 + }, + { + "epoch": 3.7257059039237257, + "grad_norm": 0.6796132326126099, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 5080 + }, + { + "epoch": 3.733039970663733, + "grad_norm": 0.7285300493240356, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5090 + }, + { + "epoch": 3.7403740374037406, + "grad_norm": 0.8931500911712646, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 5100 + }, + { + "epoch": 3.7477081041437477, + "grad_norm": 0.6256856918334961, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 5110 + }, + { + "epoch": 3.755042170883755, + "grad_norm": 0.79310142993927, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5120 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.6594041585922241, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 5130 + }, + { + "epoch": 3.7697103043637696, + "grad_norm": 0.7029327750205994, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 5140 + }, + { + "epoch": 3.777044371103777, + "grad_norm": 0.5880070328712463, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 5150 + }, + { + "epoch": 3.7843784378437846, + "grad_norm": 0.7578945159912109, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 5160 + }, + { + "epoch": 3.7917125045837916, + "grad_norm": 0.8276378512382507, + "learning_rate": 0.0002, + "loss": 0.8819, + "step": 5170 + }, + { + "epoch": 3.799046571323799, + "grad_norm": 0.7627953886985779, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 5180 + }, + { + "epoch": 3.806380638063806, + "grad_norm": 0.8169086575508118, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 5190 + }, + { + "epoch": 3.8137147048038136, + "grad_norm": 0.6605030298233032, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 5200 + }, + { + "epoch": 3.821048771543821, + "grad_norm": 0.5837286114692688, + "learning_rate": 0.0002, + "loss": 0.8804, + "step": 5210 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 1.2422157526016235, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 5220 + }, + { + "epoch": 3.8357169050238356, + "grad_norm": 0.6589220762252808, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 5230 + }, + { + "epoch": 3.843050971763843, + "grad_norm": 0.8567556142807007, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5240 + }, + { + "epoch": 3.8503850385038505, + "grad_norm": 0.6490627527236938, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 5250 + }, + { + "epoch": 3.8577191052438575, + "grad_norm": 0.620232880115509, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5260 + }, + { + "epoch": 3.865053171983865, + "grad_norm": 0.7685128450393677, + "learning_rate": 0.0002, + "loss": 0.9192, + "step": 5270 + }, + { + "epoch": 3.8723872387238725, + "grad_norm": 0.8113296627998352, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 5280 + }, + { + "epoch": 3.87972130546388, + "grad_norm": 0.8092675805091858, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 5290 + }, + { + "epoch": 3.887055372203887, + "grad_norm": 0.583570122718811, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 5300 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 1.712363600730896, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 5310 + }, + { + "epoch": 3.9017235056839015, + "grad_norm": 0.6673534512519836, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5320 + }, + { + "epoch": 3.909057572423909, + "grad_norm": 1.9770312309265137, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5330 + }, + { + "epoch": 3.9163916391639164, + "grad_norm": 0.6430999636650085, + "learning_rate": 0.0002, + "loss": 0.8793, + "step": 5340 + }, + { + "epoch": 3.923725705903924, + "grad_norm": 1.0159571170806885, + "learning_rate": 0.0002, + "loss": 0.839, + "step": 5350 + }, + { + "epoch": 3.931059772643931, + "grad_norm": 0.8607584834098816, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 5360 + }, + { + "epoch": 3.9383938393839384, + "grad_norm": 0.6967900991439819, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 5370 + }, + { + "epoch": 3.945727906123946, + "grad_norm": 0.7683077454566956, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 5380 + }, + { + "epoch": 3.953061972863953, + "grad_norm": 0.6805762648582458, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5390 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.7033619284629822, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5400 + }, + { + "epoch": 3.967730106343968, + "grad_norm": 0.966112494468689, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5410 + }, + { + "epoch": 3.9750641730839753, + "grad_norm": 0.8467881083488464, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 5420 + }, + { + "epoch": 3.9823982398239823, + "grad_norm": 0.8005317449569702, + "learning_rate": 0.0002, + "loss": 0.8084, + "step": 5430 + }, + { + "epoch": 3.98973230656399, + "grad_norm": 1.1615241765975952, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 5440 + }, + { + "epoch": 3.997066373303997, + "grad_norm": 0.6121614575386047, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 5450 + }, + { + "epoch": 4.0, + "eval_loss": 1.1834222078323364, + "eval_runtime": 32.7569, + "eval_samples_per_second": 13.158, + "eval_steps_per_second": 1.649, + "step": 5454 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.5239890799938765e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-5454/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5bb16ab95ca096920c5d5b7350054e10507d7989 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4f634851d34aa044ae96f13caf049e805ff844286cc74d8fab46081770f1af +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4dcb5cad10181ec10fa32fd6b3c7466201c6eda --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44ef702ec102cc98397673136cc4e894d33991fd0940b046e486268bfa17deae +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8475fa8f8b545c12e801def539ab1fd87bbece8c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6c5c3e36480b97449f54a91b6bb67c1ce97ff2182fabcb54951279599efded4 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b73922fca4c14f7bb64a94872248c0d53c64160f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2fccf9ed41af18b1ff88d3cd93d214635edb273f565215977f96017eaa5a6f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..52f9aa15b7a2d8c6eee09b65ea501dc7b26b6e98 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/trainer_state.json @@ -0,0 +1,4840 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 4.999633296662999, + "eval_steps": 10, + "global_step": 6817, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.48628315329551697, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 2730 + }, + { + "epoch": 2.0095342867620096, + "grad_norm": 0.413875013589859, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 2740 + }, + { + "epoch": 2.0168683535020167, + "grad_norm": 0.4988735616207123, + "learning_rate": 0.0002, + "loss": 0.965, + "step": 2750 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.5634812712669373, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 2760 + }, + { + "epoch": 2.0315364869820316, + "grad_norm": 0.48302653431892395, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 2770 + }, + { + "epoch": 2.038870553722039, + "grad_norm": 0.49914175271987915, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 2780 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 1.14039945602417, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2790 + }, + { + "epoch": 2.0535386872020536, + "grad_norm": 0.6359720826148987, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 2800 + }, + { + "epoch": 2.060872753942061, + "grad_norm": 0.4589158296585083, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 2810 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.46255481243133545, + "learning_rate": 0.0002, + "loss": 0.9438, + "step": 2820 + }, + { + "epoch": 2.0755408874220755, + "grad_norm": 0.6232137680053711, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 2830 + }, + { + "epoch": 2.082874954162083, + "grad_norm": 0.41042178869247437, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 2840 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.5334428548812866, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2850 + }, + { + "epoch": 2.0975430876420975, + "grad_norm": 0.8270058631896973, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 2860 + }, + { + "epoch": 2.104877154382105, + "grad_norm": 0.6624533534049988, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 2870 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 2880 + }, + { + "epoch": 2.1195452878621195, + "grad_norm": 0.621482789516449, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 2890 + }, + { + "epoch": 2.126879354602127, + "grad_norm": 0.4556255340576172, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2900 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 0.4620579183101654, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 2910 + }, + { + "epoch": 2.1415474880821415, + "grad_norm": 0.9602415561676025, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2920 + }, + { + "epoch": 2.148881554822149, + "grad_norm": 0.587943971157074, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2930 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 0.5121372938156128, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 2940 + }, + { + "epoch": 2.1635496883021634, + "grad_norm": 0.49424484372138977, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 2950 + }, + { + "epoch": 2.170883755042171, + "grad_norm": 0.6312560439109802, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2960 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.5235576629638672, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2970 + }, + { + "epoch": 2.1855518885221854, + "grad_norm": 0.5868439674377441, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 2980 + }, + { + "epoch": 2.192885955262193, + "grad_norm": 0.42302873730659485, + "learning_rate": 0.0002, + "loss": 0.9338, + "step": 2990 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.5097725987434387, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 3000 + }, + { + "epoch": 2.2075540887422074, + "grad_norm": 0.5091572403907776, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 3010 + }, + { + "epoch": 2.214888155482215, + "grad_norm": 0.49433162808418274, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 3020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5577368140220642, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 3030 + }, + { + "epoch": 2.2295562889622293, + "grad_norm": 0.6177583932876587, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 3040 + }, + { + "epoch": 2.236890355702237, + "grad_norm": 0.5256719589233398, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 3050 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.5001118183135986, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3060 + }, + { + "epoch": 2.2515584891822513, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3070 + }, + { + "epoch": 2.258892555922259, + "grad_norm": 0.5325384140014648, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3080 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.5719189047813416, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 2.2735606894022737, + "grad_norm": 0.6337835788726807, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 3100 + }, + { + "epoch": 2.2808947561422808, + "grad_norm": 0.5381836891174316, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 3110 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.5408531427383423, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 3120 + }, + { + "epoch": 2.2955628896222957, + "grad_norm": 0.43705281615257263, + "learning_rate": 0.0002, + "loss": 1.0325, + "step": 3130 + }, + { + "epoch": 2.3028969563623027, + "grad_norm": 0.6454030275344849, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 3140 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.686030387878418, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 3150 + }, + { + "epoch": 2.3175650898423177, + "grad_norm": 0.5123633146286011, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 3160 + }, + { + "epoch": 2.3248991565823247, + "grad_norm": 0.842506468296051, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 3170 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.5193818807601929, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 2.3395672900623397, + "grad_norm": 0.5634409189224243, + "learning_rate": 0.0002, + "loss": 0.9473, + "step": 3190 + }, + { + "epoch": 2.3469013568023467, + "grad_norm": 0.6475534439086914, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 3200 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.1503914594650269, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 2.3615694902823616, + "grad_norm": 0.7234905362129211, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 3220 + }, + { + "epoch": 2.368903557022369, + "grad_norm": 0.664903461933136, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 3230 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.5453006625175476, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 3240 + }, + { + "epoch": 2.3835716905023836, + "grad_norm": 0.6256654262542725, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 3250 + }, + { + "epoch": 2.390905757242391, + "grad_norm": 0.5166565179824829, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.5699098110198975, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 3270 + }, + { + "epoch": 2.4055738907224056, + "grad_norm": 0.4472540020942688, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 3280 + }, + { + "epoch": 2.412907957462413, + "grad_norm": 0.6790403127670288, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3290 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 0.5182185173034668, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 3300 + }, + { + "epoch": 2.4275760909424275, + "grad_norm": 0.564647912979126, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 3310 + }, + { + "epoch": 2.434910157682435, + "grad_norm": 0.5625313520431519, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 3320 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.7496559619903564, + "learning_rate": 0.0002, + "loss": 0.8798, + "step": 3330 + }, + { + "epoch": 2.4495782911624495, + "grad_norm": 0.4779128134250641, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 2.456912357902457, + "grad_norm": 0.578093409538269, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3350 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.5456080436706543, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 3360 + }, + { + "epoch": 2.4715804913824715, + "grad_norm": 0.4769273102283478, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 3370 + }, + { + "epoch": 2.478914558122479, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0002, + "loss": 0.9312, + "step": 3380 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.5590165853500366, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3390 + }, + { + "epoch": 2.4935826916024935, + "grad_norm": 0.801306962966919, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 3400 + }, + { + "epoch": 2.500916758342501, + "grad_norm": 0.6045624613761902, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 3410 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.5735858082771301, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 3420 + }, + { + "epoch": 2.5155848918225154, + "grad_norm": 0.6827309131622314, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 3430 + }, + { + "epoch": 2.522918958562523, + "grad_norm": 0.5702602863311768, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 3440 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.6674721240997314, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 3450 + }, + { + "epoch": 2.5375870920425374, + "grad_norm": 0.5635907649993896, + "learning_rate": 0.0002, + "loss": 0.914, + "step": 3460 + }, + { + "epoch": 2.544921158782545, + "grad_norm": 0.42737770080566406, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 3470 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.6720691919326782, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 2.55958929226256, + "grad_norm": 0.8917084336280823, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 3490 + }, + { + "epoch": 2.566923359002567, + "grad_norm": 0.5134549140930176, + "learning_rate": 0.0002, + "loss": 0.9257, + "step": 3500 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.4951367974281311, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 3510 + }, + { + "epoch": 2.5815914924825814, + "grad_norm": 0.9438204765319824, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 3520 + }, + { + "epoch": 2.588925559222589, + "grad_norm": 0.6024714708328247, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 3530 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.5248535871505737, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 3540 + }, + { + "epoch": 2.6035936927026038, + "grad_norm": 0.8677568435668945, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 3550 + }, + { + "epoch": 2.610927759442611, + "grad_norm": 0.82008296251297, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 3560 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.4724634885787964, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 3570 + }, + { + "epoch": 2.6255958929226257, + "grad_norm": 0.5434244275093079, + "learning_rate": 0.0002, + "loss": 0.9058, + "step": 3580 + }, + { + "epoch": 2.6329299596626328, + "grad_norm": 0.4948740005493164, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 3590 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.42109328508377075, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3600 + }, + { + "epoch": 2.6475980931426477, + "grad_norm": 0.7979786396026611, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 3610 + }, + { + "epoch": 2.654932159882655, + "grad_norm": 0.6345919370651245, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 3620 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.4971671402454376, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 3630 + }, + { + "epoch": 2.6696002933626697, + "grad_norm": 0.6467748284339905, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 3640 + }, + { + "epoch": 2.6769343601026767, + "grad_norm": 0.4240160286426544, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 3650 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.5179754495620728, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3660 + }, + { + "epoch": 2.6916024935826917, + "grad_norm": 0.754012405872345, + "learning_rate": 0.0002, + "loss": 0.9221, + "step": 3670 + }, + { + "epoch": 2.698936560322699, + "grad_norm": 0.5141299962997437, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3680 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.5737819075584412, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 3690 + }, + { + "epoch": 2.7136046938027136, + "grad_norm": 0.5887577533721924, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 3700 + }, + { + "epoch": 2.720938760542721, + "grad_norm": 0.6740471720695496, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 3710 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 3720 + }, + { + "epoch": 2.7356068940227356, + "grad_norm": 0.4858354926109314, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 3730 + }, + { + "epoch": 2.742940960762743, + "grad_norm": 0.5489001870155334, + "learning_rate": 0.0002, + "loss": 0.9308, + "step": 3740 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.8187092542648315, + "learning_rate": 0.0002, + "loss": 0.894, + "step": 3750 + }, + { + "epoch": 2.7576090942427576, + "grad_norm": 0.5666626691818237, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 3760 + }, + { + "epoch": 2.764943160982765, + "grad_norm": 0.5377066135406494, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 3770 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.566330075263977, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 3780 + }, + { + "epoch": 2.7796112944627795, + "grad_norm": 0.5522832870483398, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 3790 + }, + { + "epoch": 2.786945361202787, + "grad_norm": 0.5668695569038391, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3800 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.7566602826118469, + "learning_rate": 0.0002, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 2.8016134946828015, + "grad_norm": 0.5603684782981873, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 3820 + }, + { + "epoch": 2.808947561422809, + "grad_norm": 0.49122217297554016, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.6798251867294312, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 3840 + }, + { + "epoch": 2.8236156949028235, + "grad_norm": 0.6097991466522217, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 3850 + }, + { + "epoch": 2.830949761642831, + "grad_norm": 0.6675726175308228, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 3860 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.9223952889442444, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 3870 + }, + { + "epoch": 2.8456178951228455, + "grad_norm": 0.6020799875259399, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 3880 + }, + { + "epoch": 2.852951961862853, + "grad_norm": 0.5206381678581238, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3890 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.6268777251243591, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 3900 + }, + { + "epoch": 2.8676200953428674, + "grad_norm": 1.1583497524261475, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 3910 + }, + { + "epoch": 2.874954162082875, + "grad_norm": 0.7263903021812439, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 3920 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.5369910001754761, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 3930 + }, + { + "epoch": 2.88962229556289, + "grad_norm": 0.7298350930213928, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 3940 + }, + { + "epoch": 2.896956362302897, + "grad_norm": 0.577012836933136, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.5859594345092773, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 3960 + }, + { + "epoch": 2.9116244957829114, + "grad_norm": 0.47176122665405273, + "learning_rate": 0.0002, + "loss": 0.8772, + "step": 3970 + }, + { + "epoch": 2.918958562522919, + "grad_norm": 0.9699620604515076, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 3980 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.7908747792243958, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3990 + }, + { + "epoch": 2.933626696002934, + "grad_norm": 0.5777379274368286, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 4000 + }, + { + "epoch": 2.940960762742941, + "grad_norm": 0.599288284778595, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 4010 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.5232274532318115, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 4020 + }, + { + "epoch": 2.9556288962229558, + "grad_norm": 0.6395137310028076, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 4030 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.589260458946228, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 4040 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5699581503868103, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 4050 + }, + { + "epoch": 2.9776310964429777, + "grad_norm": 0.528468132019043, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 4060 + }, + { + "epoch": 2.984965163182985, + "grad_norm": 0.4804670512676239, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4070 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 1.1918889284133911, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 4080 + }, + { + "epoch": 2.9996332966629997, + "grad_norm": 0.5479103326797485, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 4090 + }, + { + "epoch": 2.9996332966629997, + "eval_loss": 1.1642853021621704, + "eval_runtime": 32.7511, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.649, + "step": 4090 + }, + { + "epoch": 3.006967363403007, + "grad_norm": 0.7430027723312378, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 4100 + }, + { + "epoch": 3.014301430143014, + "grad_norm": 0.6293647289276123, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4110 + }, + { + "epoch": 3.0216354968830217, + "grad_norm": 0.6191329956054688, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 4120 + }, + { + "epoch": 3.028969563623029, + "grad_norm": 0.7959313988685608, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4130 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 0.5956351161003113, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 4140 + }, + { + "epoch": 3.0436376971030437, + "grad_norm": 0.670383632183075, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 4150 + }, + { + "epoch": 3.050971763843051, + "grad_norm": 0.6414518356323242, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 4160 + }, + { + "epoch": 3.058305830583058, + "grad_norm": 0.7928852438926697, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 4170 + }, + { + "epoch": 3.0656398973230656, + "grad_norm": 0.6211121082305908, + "learning_rate": 0.0002, + "loss": 0.7914, + "step": 4180 + }, + { + "epoch": 3.072973964063073, + "grad_norm": 0.6237057447433472, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 4190 + }, + { + "epoch": 3.08030803080308, + "grad_norm": 0.6522233486175537, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 4200 + }, + { + "epoch": 3.0876420975430876, + "grad_norm": 0.9396848678588867, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4210 + }, + { + "epoch": 3.094976164283095, + "grad_norm": 0.8003010749816895, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 4220 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.6733810305595398, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 4230 + }, + { + "epoch": 3.1096442977631096, + "grad_norm": 0.6365828514099121, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 4240 + }, + { + "epoch": 3.116978364503117, + "grad_norm": 1.0805548429489136, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4250 + }, + { + "epoch": 3.1243124312431245, + "grad_norm": 0.7262141108512878, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4260 + }, + { + "epoch": 3.1316464979831315, + "grad_norm": 0.5500539541244507, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 4270 + }, + { + "epoch": 3.138980564723139, + "grad_norm": 0.793912947177887, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 4280 + }, + { + "epoch": 3.1463146314631465, + "grad_norm": 1.2540518045425415, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 4290 + }, + { + "epoch": 3.1536486982031535, + "grad_norm": 0.7020077705383301, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 4300 + }, + { + "epoch": 3.160982764943161, + "grad_norm": 0.5111123323440552, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 4310 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.7172090411186218, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 4320 + }, + { + "epoch": 3.1756508984231755, + "grad_norm": 0.6343168616294861, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 4330 + }, + { + "epoch": 3.182984965163183, + "grad_norm": 0.9563672542572021, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4340 + }, + { + "epoch": 3.1903190319031904, + "grad_norm": 1.0225574970245361, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4350 + }, + { + "epoch": 3.1976530986431975, + "grad_norm": 1.1633386611938477, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 4360 + }, + { + "epoch": 3.204987165383205, + "grad_norm": 0.8915148973464966, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 4370 + }, + { + "epoch": 3.2123212321232124, + "grad_norm": 0.9156812429428101, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4380 + }, + { + "epoch": 3.21965529886322, + "grad_norm": 0.6363258957862854, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 4390 + }, + { + "epoch": 3.226989365603227, + "grad_norm": 0.579099178314209, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 4400 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 0.8778146505355835, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 4410 + }, + { + "epoch": 3.241657499083242, + "grad_norm": 0.8356770873069763, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 4420 + }, + { + "epoch": 3.248991565823249, + "grad_norm": 0.702032208442688, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 4430 + }, + { + "epoch": 3.2563256325632564, + "grad_norm": 0.6386539340019226, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 4440 + }, + { + "epoch": 3.263659699303264, + "grad_norm": 0.7008408904075623, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 4450 + }, + { + "epoch": 3.270993766043271, + "grad_norm": 0.9556332230567932, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 4460 + }, + { + "epoch": 3.2783278327832783, + "grad_norm": 0.5667835474014282, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 4470 + }, + { + "epoch": 3.285661899523286, + "grad_norm": 0.8239172697067261, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 4480 + }, + { + "epoch": 3.292995966263293, + "grad_norm": 0.7045050859451294, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 4490 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.7131434082984924, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 4500 + }, + { + "epoch": 3.3076640997433078, + "grad_norm": 0.6924910545349121, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 4510 + }, + { + "epoch": 3.3149981664833152, + "grad_norm": 0.8945356607437134, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 4520 + }, + { + "epoch": 3.3223322332233223, + "grad_norm": 0.6546903252601624, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 4530 + }, + { + "epoch": 3.3296662999633297, + "grad_norm": 0.8206679224967957, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4540 + }, + { + "epoch": 3.3370003667033368, + "grad_norm": 0.6482203602790833, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 4550 + }, + { + "epoch": 3.3443344334433442, + "grad_norm": 0.7558760046958923, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 4560 + }, + { + "epoch": 3.3516685001833517, + "grad_norm": 0.7794756889343262, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 4570 + }, + { + "epoch": 3.359002566923359, + "grad_norm": 0.7382805943489075, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4580 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.5912511944770813, + "learning_rate": 0.0002, + "loss": 0.8511, + "step": 4590 + }, + { + "epoch": 3.3736707004033737, + "grad_norm": 0.7444885969161987, + "learning_rate": 0.0002, + "loss": 0.8272, + "step": 4600 + }, + { + "epoch": 3.381004767143381, + "grad_norm": 0.7354922890663147, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 4610 + }, + { + "epoch": 3.388338833883388, + "grad_norm": 0.7685934901237488, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 4620 + }, + { + "epoch": 3.3956729006233957, + "grad_norm": 0.61041259765625, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 4630 + }, + { + "epoch": 3.403006967363403, + "grad_norm": 0.6820451021194458, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 4640 + }, + { + "epoch": 3.41034103410341, + "grad_norm": 0.5819534063339233, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 4650 + }, + { + "epoch": 3.4176751008434176, + "grad_norm": 0.705410897731781, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 4660 + }, + { + "epoch": 3.425009167583425, + "grad_norm": 0.8052892088890076, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 4670 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.7746483087539673, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 4680 + }, + { + "epoch": 3.4396773010634396, + "grad_norm": 0.7713689804077148, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 4690 + }, + { + "epoch": 3.447011367803447, + "grad_norm": 0.810371994972229, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 4700 + }, + { + "epoch": 3.4543454345434546, + "grad_norm": 0.7702969312667847, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4710 + }, + { + "epoch": 3.4616795012834616, + "grad_norm": 0.7069268822669983, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4720 + }, + { + "epoch": 3.469013568023469, + "grad_norm": 0.7640359401702881, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 4730 + }, + { + "epoch": 3.4763476347634765, + "grad_norm": 0.8661707639694214, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 4740 + }, + { + "epoch": 3.4836817015034836, + "grad_norm": 0.9970282912254333, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 4750 + }, + { + "epoch": 3.491015768243491, + "grad_norm": 0.5824355483055115, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 4760 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 1.3072649240493774, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 4770 + }, + { + "epoch": 3.5056839017235055, + "grad_norm": 0.873978316783905, + "learning_rate": 0.0002, + "loss": 0.9101, + "step": 4780 + }, + { + "epoch": 3.513017968463513, + "grad_norm": 0.5526657104492188, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4790 + }, + { + "epoch": 3.5203520352035205, + "grad_norm": 0.790894627571106, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 4800 + }, + { + "epoch": 3.5276861019435275, + "grad_norm": 0.8119630217552185, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 4810 + }, + { + "epoch": 3.535020168683535, + "grad_norm": 0.633212149143219, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 4820 + }, + { + "epoch": 3.5423542354235424, + "grad_norm": 0.703029990196228, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 4830 + }, + { + "epoch": 3.54968830216355, + "grad_norm": 0.7603771686553955, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 4840 + }, + { + "epoch": 3.557022368903557, + "grad_norm": 0.6260480880737305, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 4850 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.8203664422035217, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 4860 + }, + { + "epoch": 3.5716905023835714, + "grad_norm": 0.7793813347816467, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 4870 + }, + { + "epoch": 3.579024569123579, + "grad_norm": 0.7667397260665894, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 4880 + }, + { + "epoch": 3.5863586358635864, + "grad_norm": 0.8198829889297485, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 4890 + }, + { + "epoch": 3.593692702603594, + "grad_norm": 0.7689233422279358, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 4900 + }, + { + "epoch": 3.601026769343601, + "grad_norm": 0.7870983481407166, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 4910 + }, + { + "epoch": 3.6083608360836084, + "grad_norm": 0.8133853077888489, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 4920 + }, + { + "epoch": 3.615694902823616, + "grad_norm": 1.308401346206665, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4930 + }, + { + "epoch": 3.623028969563623, + "grad_norm": 0.7131121754646301, + "learning_rate": 0.0002, + "loss": 0.8494, + "step": 4940 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.6825910210609436, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 4950 + }, + { + "epoch": 3.637697103043638, + "grad_norm": 0.7254678606987, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4960 + }, + { + "epoch": 3.6450311697836453, + "grad_norm": 0.8045085072517395, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4970 + }, + { + "epoch": 3.6523652365236523, + "grad_norm": 0.6991777420043945, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 4980 + }, + { + "epoch": 3.6596993032636598, + "grad_norm": 0.7804713249206543, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 4990 + }, + { + "epoch": 3.667033370003667, + "grad_norm": 0.8525708317756653, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 5000 + }, + { + "epoch": 3.6743674367436743, + "grad_norm": 0.7959994673728943, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 5010 + }, + { + "epoch": 3.6817015034836817, + "grad_norm": 0.8103628158569336, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5020 + }, + { + "epoch": 3.689035570223689, + "grad_norm": 0.7517836093902588, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 5030 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.6878514289855957, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 5040 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 1.2371820211410522, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 5050 + }, + { + "epoch": 3.711037770443711, + "grad_norm": 0.6567103862762451, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 5060 + }, + { + "epoch": 3.718371837183718, + "grad_norm": 1.1254922151565552, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 5070 + }, + { + "epoch": 3.7257059039237257, + "grad_norm": 0.6796132326126099, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 5080 + }, + { + "epoch": 3.733039970663733, + "grad_norm": 0.7285300493240356, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5090 + }, + { + "epoch": 3.7403740374037406, + "grad_norm": 0.8931500911712646, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 5100 + }, + { + "epoch": 3.7477081041437477, + "grad_norm": 0.6256856918334961, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 5110 + }, + { + "epoch": 3.755042170883755, + "grad_norm": 0.79310142993927, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5120 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.6594041585922241, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 5130 + }, + { + "epoch": 3.7697103043637696, + "grad_norm": 0.7029327750205994, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 5140 + }, + { + "epoch": 3.777044371103777, + "grad_norm": 0.5880070328712463, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 5150 + }, + { + "epoch": 3.7843784378437846, + "grad_norm": 0.7578945159912109, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 5160 + }, + { + "epoch": 3.7917125045837916, + "grad_norm": 0.8276378512382507, + "learning_rate": 0.0002, + "loss": 0.8819, + "step": 5170 + }, + { + "epoch": 3.799046571323799, + "grad_norm": 0.7627953886985779, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 5180 + }, + { + "epoch": 3.806380638063806, + "grad_norm": 0.8169086575508118, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 5190 + }, + { + "epoch": 3.8137147048038136, + "grad_norm": 0.6605030298233032, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 5200 + }, + { + "epoch": 3.821048771543821, + "grad_norm": 0.5837286114692688, + "learning_rate": 0.0002, + "loss": 0.8804, + "step": 5210 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 1.2422157526016235, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 5220 + }, + { + "epoch": 3.8357169050238356, + "grad_norm": 0.6589220762252808, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 5230 + }, + { + "epoch": 3.843050971763843, + "grad_norm": 0.8567556142807007, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5240 + }, + { + "epoch": 3.8503850385038505, + "grad_norm": 0.6490627527236938, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 5250 + }, + { + "epoch": 3.8577191052438575, + "grad_norm": 0.620232880115509, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5260 + }, + { + "epoch": 3.865053171983865, + "grad_norm": 0.7685128450393677, + "learning_rate": 0.0002, + "loss": 0.9192, + "step": 5270 + }, + { + "epoch": 3.8723872387238725, + "grad_norm": 0.8113296627998352, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 5280 + }, + { + "epoch": 3.87972130546388, + "grad_norm": 0.8092675805091858, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 5290 + }, + { + "epoch": 3.887055372203887, + "grad_norm": 0.583570122718811, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 5300 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 1.712363600730896, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 5310 + }, + { + "epoch": 3.9017235056839015, + "grad_norm": 0.6673534512519836, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5320 + }, + { + "epoch": 3.909057572423909, + "grad_norm": 1.9770312309265137, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5330 + }, + { + "epoch": 3.9163916391639164, + "grad_norm": 0.6430999636650085, + "learning_rate": 0.0002, + "loss": 0.8793, + "step": 5340 + }, + { + "epoch": 3.923725705903924, + "grad_norm": 1.0159571170806885, + "learning_rate": 0.0002, + "loss": 0.839, + "step": 5350 + }, + { + "epoch": 3.931059772643931, + "grad_norm": 0.8607584834098816, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 5360 + }, + { + "epoch": 3.9383938393839384, + "grad_norm": 0.6967900991439819, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 5370 + }, + { + "epoch": 3.945727906123946, + "grad_norm": 0.7683077454566956, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 5380 + }, + { + "epoch": 3.953061972863953, + "grad_norm": 0.6805762648582458, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5390 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.7033619284629822, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5400 + }, + { + "epoch": 3.967730106343968, + "grad_norm": 0.966112494468689, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5410 + }, + { + "epoch": 3.9750641730839753, + "grad_norm": 0.8467881083488464, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 5420 + }, + { + "epoch": 3.9823982398239823, + "grad_norm": 0.8005317449569702, + "learning_rate": 0.0002, + "loss": 0.8084, + "step": 5430 + }, + { + "epoch": 3.98973230656399, + "grad_norm": 1.1615241765975952, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 5440 + }, + { + "epoch": 3.997066373303997, + "grad_norm": 0.6121614575386047, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 5450 + }, + { + "epoch": 4.0, + "eval_loss": 1.1834222078323364, + "eval_runtime": 32.7569, + "eval_samples_per_second": 13.158, + "eval_steps_per_second": 1.649, + "step": 5454 + }, + { + "epoch": 4.004400440044004, + "grad_norm": 0.6055727005004883, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 5460 + }, + { + "epoch": 4.011734506784012, + "grad_norm": 0.8232647180557251, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 5470 + }, + { + "epoch": 4.019068573524019, + "grad_norm": 0.7739192247390747, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5480 + }, + { + "epoch": 4.026402640264027, + "grad_norm": 0.6264950633049011, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 5490 + }, + { + "epoch": 4.033736707004033, + "grad_norm": 1.4798702001571655, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 5500 + }, + { + "epoch": 4.041070773744041, + "grad_norm": 0.9538470506668091, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 5510 + }, + { + "epoch": 4.048404840484048, + "grad_norm": 0.834561288356781, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 5520 + }, + { + "epoch": 4.055738907224056, + "grad_norm": 0.6407850384712219, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 5530 + }, + { + "epoch": 4.063072973964063, + "grad_norm": 0.9035961627960205, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 5540 + }, + { + "epoch": 4.070407040704071, + "grad_norm": 0.842812716960907, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 5550 + }, + { + "epoch": 4.077741107444078, + "grad_norm": 0.8197882175445557, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 5560 + }, + { + "epoch": 4.085075174184085, + "grad_norm": 0.8652673959732056, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 5570 + }, + { + "epoch": 4.092409240924092, + "grad_norm": 0.8048318028450012, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 5580 + }, + { + "epoch": 4.0997433076641, + "grad_norm": 0.9604969024658203, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 5590 + }, + { + "epoch": 4.107077374404107, + "grad_norm": 1.244756817817688, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 5600 + }, + { + "epoch": 4.114411441144115, + "grad_norm": 0.7975269556045532, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5610 + }, + { + "epoch": 4.121745507884122, + "grad_norm": 0.6130099296569824, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 5620 + }, + { + "epoch": 4.129079574624129, + "grad_norm": 0.7793202996253967, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 5630 + }, + { + "epoch": 4.136413641364136, + "grad_norm": 1.187238335609436, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 5640 + }, + { + "epoch": 4.143747708104144, + "grad_norm": 0.8450375199317932, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 5650 + }, + { + "epoch": 4.151081774844151, + "grad_norm": 0.9006940126419067, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5660 + }, + { + "epoch": 4.158415841584159, + "grad_norm": 0.9447154998779297, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 5670 + }, + { + "epoch": 4.165749908324166, + "grad_norm": 0.798032283782959, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 5680 + }, + { + "epoch": 4.1730839750641735, + "grad_norm": 0.65578693151474, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 5690 + }, + { + "epoch": 4.18041804180418, + "grad_norm": 1.0864700078964233, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 5700 + }, + { + "epoch": 4.187752108544188, + "grad_norm": 0.7344121932983398, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 5710 + }, + { + "epoch": 4.195086175284195, + "grad_norm": 0.9722456932067871, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 5720 + }, + { + "epoch": 4.2024202420242025, + "grad_norm": 1.263814926147461, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 5730 + }, + { + "epoch": 4.20975430876421, + "grad_norm": 0.9622581005096436, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 5740 + }, + { + "epoch": 4.2170883755042174, + "grad_norm": 0.8497143387794495, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 5750 + }, + { + "epoch": 4.224422442244224, + "grad_norm": 0.8248446583747864, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 5760 + }, + { + "epoch": 4.2317565089842315, + "grad_norm": 1.2544798851013184, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 5770 + }, + { + "epoch": 4.239090575724239, + "grad_norm": 0.8224676251411438, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5780 + }, + { + "epoch": 4.2464246424642464, + "grad_norm": 0.8924877047538757, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 5790 + }, + { + "epoch": 4.253758709204254, + "grad_norm": 0.8545848727226257, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 5800 + }, + { + "epoch": 4.261092775944261, + "grad_norm": 0.8081067800521851, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 5810 + }, + { + "epoch": 4.268426842684269, + "grad_norm": 0.7111002802848816, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 5820 + }, + { + "epoch": 4.2757609094242754, + "grad_norm": 0.8696979880332947, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 5830 + }, + { + "epoch": 4.283094976164283, + "grad_norm": 0.821401834487915, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 5840 + }, + { + "epoch": 4.29042904290429, + "grad_norm": 0.888908326625824, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 5850 + }, + { + "epoch": 4.297763109644298, + "grad_norm": 1.9380123615264893, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 5860 + }, + { + "epoch": 4.305097176384305, + "grad_norm": 1.121774435043335, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 5870 + }, + { + "epoch": 4.312431243124313, + "grad_norm": 0.9238282442092896, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 5880 + }, + { + "epoch": 4.319765309864319, + "grad_norm": 0.7321620583534241, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 5890 + }, + { + "epoch": 4.327099376604327, + "grad_norm": 0.8739548325538635, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 5900 + }, + { + "epoch": 4.334433443344334, + "grad_norm": 0.9686012268066406, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 5910 + }, + { + "epoch": 4.341767510084342, + "grad_norm": 0.9033839106559753, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5920 + }, + { + "epoch": 4.349101576824349, + "grad_norm": 0.8131115436553955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 5930 + }, + { + "epoch": 4.356435643564357, + "grad_norm": 0.8942412734031677, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 5940 + }, + { + "epoch": 4.363769710304364, + "grad_norm": 0.8439112901687622, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 5950 + }, + { + "epoch": 4.371103777044371, + "grad_norm": 0.9176713228225708, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 5960 + }, + { + "epoch": 4.378437843784378, + "grad_norm": 0.6799634695053101, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 5970 + }, + { + "epoch": 4.385771910524386, + "grad_norm": 1.0435824394226074, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5980 + }, + { + "epoch": 4.393105977264393, + "grad_norm": 0.997937798500061, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 5990 + }, + { + "epoch": 4.400440044004401, + "grad_norm": 1.0308842658996582, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 6000 + }, + { + "epoch": 4.407774110744408, + "grad_norm": 1.3683775663375854, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 6010 + }, + { + "epoch": 4.415108177484415, + "grad_norm": 0.7569534182548523, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 6020 + }, + { + "epoch": 4.422442244224422, + "grad_norm": 1.089978575706482, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 6030 + }, + { + "epoch": 4.42977631096443, + "grad_norm": 0.7522459626197815, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 6040 + }, + { + "epoch": 4.437110377704437, + "grad_norm": 0.6709823608398438, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 6050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.6992089748382568, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 6060 + }, + { + "epoch": 4.451778511184452, + "grad_norm": 1.0182931423187256, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 6070 + }, + { + "epoch": 4.459112577924459, + "grad_norm": 1.0685160160064697, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 6080 + }, + { + "epoch": 4.466446644664466, + "grad_norm": 0.8295124769210815, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 6090 + }, + { + "epoch": 4.473780711404474, + "grad_norm": 1.1862998008728027, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6100 + }, + { + "epoch": 4.481114778144481, + "grad_norm": 0.7400273084640503, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 6110 + }, + { + "epoch": 4.488448844884489, + "grad_norm": 0.7098417282104492, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 6120 + }, + { + "epoch": 4.495782911624496, + "grad_norm": 0.9745053648948669, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 6130 + }, + { + "epoch": 4.503116978364503, + "grad_norm": 0.8638797998428345, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 6140 + }, + { + "epoch": 4.51045104510451, + "grad_norm": 0.8291046619415283, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6150 + }, + { + "epoch": 4.517785111844518, + "grad_norm": 1.0301737785339355, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 6160 + }, + { + "epoch": 4.525119178584525, + "grad_norm": 1.1996512413024902, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 6170 + }, + { + "epoch": 4.5324532453245325, + "grad_norm": 1.151038408279419, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 6180 + }, + { + "epoch": 4.53978731206454, + "grad_norm": 0.8385201096534729, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 6190 + }, + { + "epoch": 4.5471213788045475, + "grad_norm": 0.8969188332557678, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 6200 + }, + { + "epoch": 4.554455445544555, + "grad_norm": 1.60659658908844, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 6210 + }, + { + "epoch": 4.5617895122845615, + "grad_norm": 0.9356731176376343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 6220 + }, + { + "epoch": 4.569123579024569, + "grad_norm": 0.95856773853302, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 6230 + }, + { + "epoch": 4.5764576457645765, + "grad_norm": 1.1162524223327637, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 6240 + }, + { + "epoch": 4.583791712504584, + "grad_norm": 0.8809238076210022, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 6250 + }, + { + "epoch": 4.591125779244591, + "grad_norm": 0.890738844871521, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 6260 + }, + { + "epoch": 4.598459845984598, + "grad_norm": 0.918684720993042, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 6270 + }, + { + "epoch": 4.6057939127246055, + "grad_norm": 0.8156296610832214, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 6280 + }, + { + "epoch": 4.613127979464613, + "grad_norm": 1.046634316444397, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6290 + }, + { + "epoch": 4.62046204620462, + "grad_norm": 0.7725525498390198, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 6300 + }, + { + "epoch": 4.627796112944628, + "grad_norm": 0.9992046356201172, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 6310 + }, + { + "epoch": 4.635130179684635, + "grad_norm": 0.8480095267295837, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 6320 + }, + { + "epoch": 4.642464246424643, + "grad_norm": 0.7061955332756042, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 6330 + }, + { + "epoch": 4.649798313164649, + "grad_norm": 1.0354212522506714, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 6340 + }, + { + "epoch": 4.657132379904657, + "grad_norm": 1.0081377029418945, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6350 + }, + { + "epoch": 4.664466446644664, + "grad_norm": 1.2904249429702759, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 6360 + }, + { + "epoch": 4.671800513384672, + "grad_norm": 0.9248910546302795, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 6370 + }, + { + "epoch": 4.679134580124679, + "grad_norm": 0.9907804131507874, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 6380 + }, + { + "epoch": 4.686468646864687, + "grad_norm": 1.201143741607666, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 6390 + }, + { + "epoch": 4.693802713604693, + "grad_norm": 0.8709394335746765, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 6400 + }, + { + "epoch": 4.701136780344701, + "grad_norm": 0.7468608021736145, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 6410 + }, + { + "epoch": 4.708470847084708, + "grad_norm": 0.8607903718948364, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 6420 + }, + { + "epoch": 4.715804913824716, + "grad_norm": 0.9840512871742249, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 6430 + }, + { + "epoch": 4.723138980564723, + "grad_norm": 0.8328204154968262, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 6440 + }, + { + "epoch": 4.730473047304731, + "grad_norm": 0.924505352973938, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 6450 + }, + { + "epoch": 4.737807114044738, + "grad_norm": 0.8897685408592224, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 6460 + }, + { + "epoch": 4.745141180784745, + "grad_norm": 0.9605024456977844, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6470 + }, + { + "epoch": 4.752475247524752, + "grad_norm": 0.8150759935379028, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 6480 + }, + { + "epoch": 4.75980931426476, + "grad_norm": 0.8128412961959839, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 6490 + }, + { + "epoch": 4.767143381004767, + "grad_norm": 0.7381404638290405, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 6500 + }, + { + "epoch": 4.774477447744775, + "grad_norm": 1.0565853118896484, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 6510 + }, + { + "epoch": 4.781811514484782, + "grad_norm": 0.9298134446144104, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6520 + }, + { + "epoch": 4.789145581224789, + "grad_norm": 1.0145525932312012, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 6530 + }, + { + "epoch": 4.796479647964796, + "grad_norm": 0.92259681224823, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 6540 + }, + { + "epoch": 4.803813714704804, + "grad_norm": 0.7881024479866028, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 6550 + }, + { + "epoch": 4.811147781444811, + "grad_norm": 1.4935206174850464, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 6560 + }, + { + "epoch": 4.818481848184819, + "grad_norm": 0.8612369298934937, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 6570 + }, + { + "epoch": 4.825815914924826, + "grad_norm": 1.0118653774261475, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 6580 + }, + { + "epoch": 4.833149981664834, + "grad_norm": 1.1303809881210327, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 6590 + }, + { + "epoch": 4.84048404840484, + "grad_norm": 0.9112492203712463, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 6600 + }, + { + "epoch": 4.847818115144848, + "grad_norm": 0.864762544631958, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 6610 + }, + { + "epoch": 4.855152181884855, + "grad_norm": 0.9090572595596313, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 6620 + }, + { + "epoch": 4.862486248624863, + "grad_norm": 1.014953374862671, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 6630 + }, + { + "epoch": 4.86982031536487, + "grad_norm": 1.0702149868011475, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 6640 + }, + { + "epoch": 4.8771543821048775, + "grad_norm": 1.002135157585144, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 6650 + }, + { + "epoch": 4.884488448844884, + "grad_norm": 0.862545907497406, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 6660 + }, + { + "epoch": 4.891822515584892, + "grad_norm": 0.7302131056785583, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 6670 + }, + { + "epoch": 4.899156582324899, + "grad_norm": 0.8380730152130127, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 6680 + }, + { + "epoch": 4.9064906490649065, + "grad_norm": 0.7956018447875977, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 6690 + }, + { + "epoch": 4.913824715804914, + "grad_norm": 0.6717583537101746, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 6700 + }, + { + "epoch": 4.9211587825449215, + "grad_norm": 1.09099280834198, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 6710 + }, + { + "epoch": 4.928492849284929, + "grad_norm": 0.8589889407157898, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 6720 + }, + { + "epoch": 4.9358269160249355, + "grad_norm": 1.0046314001083374, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 6730 + }, + { + "epoch": 4.943160982764943, + "grad_norm": 0.8559659123420715, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 6740 + }, + { + "epoch": 4.9504950495049505, + "grad_norm": 0.8588525652885437, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 6750 + }, + { + "epoch": 4.957829116244958, + "grad_norm": 0.9192708134651184, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 6760 + }, + { + "epoch": 4.965163182984965, + "grad_norm": 1.051398754119873, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 6770 + }, + { + "epoch": 4.972497249724973, + "grad_norm": 0.9111362099647522, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 6780 + }, + { + "epoch": 4.9798313164649795, + "grad_norm": 0.7305638194084167, + "learning_rate": 0.0002, + "loss": 0.7613, + "step": 6790 + }, + { + "epoch": 4.987165383204987, + "grad_norm": 1.118837594985962, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 6800 + }, + { + "epoch": 4.994499449944994, + "grad_norm": 0.9075239300727844, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 6810 + }, + { + "epoch": 4.999633296662999, + "eval_loss": 1.2361247539520264, + "eval_runtime": 32.7325, + "eval_samples_per_second": 13.167, + "eval_steps_per_second": 1.65, + "step": 6817 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.1549863499923456e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-6817/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f84d0c20b043a302eae9c8985fa4b100bc27236 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc82fdb16bc9320508ca18eb240c5dc89509a062e490c9a6ec2f233ff61a96a6 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..704898495d039c14404a6c9ddccd0d6fb21c9cfa --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e882cee82cf7301b2f56c95a7c530935d8b933b312beb0d6854b0d7d72c702 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..063e77d95a8152895f0ef43259d8b615ce355576 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca658d9e71a3ab50f947b897f388acf6699ad2bd6b92adae6468a2433e3aa3d +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..668af33cb4c86c5dba1e08264d59f56026f1d3ae --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fe03bf7018c0aa325897c70d0d73733264f489b69d7fd26965086e90396ce58 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..233d2cad79f5f691aa15964b0f5532581caea619 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/trainer_state.json @@ -0,0 +1,5807 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 8181, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.48628315329551697, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 2730 + }, + { + "epoch": 2.0095342867620096, + "grad_norm": 0.413875013589859, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 2740 + }, + { + "epoch": 2.0168683535020167, + "grad_norm": 0.4988735616207123, + "learning_rate": 0.0002, + "loss": 0.965, + "step": 2750 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.5634812712669373, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 2760 + }, + { + "epoch": 2.0315364869820316, + "grad_norm": 0.48302653431892395, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 2770 + }, + { + "epoch": 2.038870553722039, + "grad_norm": 0.49914175271987915, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 2780 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 1.14039945602417, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2790 + }, + { + "epoch": 2.0535386872020536, + "grad_norm": 0.6359720826148987, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 2800 + }, + { + "epoch": 2.060872753942061, + "grad_norm": 0.4589158296585083, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 2810 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.46255481243133545, + "learning_rate": 0.0002, + "loss": 0.9438, + "step": 2820 + }, + { + "epoch": 2.0755408874220755, + "grad_norm": 0.6232137680053711, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 2830 + }, + { + "epoch": 2.082874954162083, + "grad_norm": 0.41042178869247437, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 2840 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.5334428548812866, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2850 + }, + { + "epoch": 2.0975430876420975, + "grad_norm": 0.8270058631896973, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 2860 + }, + { + "epoch": 2.104877154382105, + "grad_norm": 0.6624533534049988, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 2870 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 2880 + }, + { + "epoch": 2.1195452878621195, + "grad_norm": 0.621482789516449, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 2890 + }, + { + "epoch": 2.126879354602127, + "grad_norm": 0.4556255340576172, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2900 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 0.4620579183101654, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 2910 + }, + { + "epoch": 2.1415474880821415, + "grad_norm": 0.9602415561676025, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2920 + }, + { + "epoch": 2.148881554822149, + "grad_norm": 0.587943971157074, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2930 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 0.5121372938156128, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 2940 + }, + { + "epoch": 2.1635496883021634, + "grad_norm": 0.49424484372138977, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 2950 + }, + { + "epoch": 2.170883755042171, + "grad_norm": 0.6312560439109802, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2960 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.5235576629638672, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2970 + }, + { + "epoch": 2.1855518885221854, + "grad_norm": 0.5868439674377441, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 2980 + }, + { + "epoch": 2.192885955262193, + "grad_norm": 0.42302873730659485, + "learning_rate": 0.0002, + "loss": 0.9338, + "step": 2990 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.5097725987434387, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 3000 + }, + { + "epoch": 2.2075540887422074, + "grad_norm": 0.5091572403907776, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 3010 + }, + { + "epoch": 2.214888155482215, + "grad_norm": 0.49433162808418274, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 3020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5577368140220642, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 3030 + }, + { + "epoch": 2.2295562889622293, + "grad_norm": 0.6177583932876587, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 3040 + }, + { + "epoch": 2.236890355702237, + "grad_norm": 0.5256719589233398, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 3050 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.5001118183135986, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3060 + }, + { + "epoch": 2.2515584891822513, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3070 + }, + { + "epoch": 2.258892555922259, + "grad_norm": 0.5325384140014648, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3080 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.5719189047813416, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 2.2735606894022737, + "grad_norm": 0.6337835788726807, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 3100 + }, + { + "epoch": 2.2808947561422808, + "grad_norm": 0.5381836891174316, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 3110 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.5408531427383423, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 3120 + }, + { + "epoch": 2.2955628896222957, + "grad_norm": 0.43705281615257263, + "learning_rate": 0.0002, + "loss": 1.0325, + "step": 3130 + }, + { + "epoch": 2.3028969563623027, + "grad_norm": 0.6454030275344849, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 3140 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.686030387878418, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 3150 + }, + { + "epoch": 2.3175650898423177, + "grad_norm": 0.5123633146286011, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 3160 + }, + { + "epoch": 2.3248991565823247, + "grad_norm": 0.842506468296051, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 3170 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.5193818807601929, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 2.3395672900623397, + "grad_norm": 0.5634409189224243, + "learning_rate": 0.0002, + "loss": 0.9473, + "step": 3190 + }, + { + "epoch": 2.3469013568023467, + "grad_norm": 0.6475534439086914, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 3200 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.1503914594650269, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 2.3615694902823616, + "grad_norm": 0.7234905362129211, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 3220 + }, + { + "epoch": 2.368903557022369, + "grad_norm": 0.664903461933136, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 3230 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.5453006625175476, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 3240 + }, + { + "epoch": 2.3835716905023836, + "grad_norm": 0.6256654262542725, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 3250 + }, + { + "epoch": 2.390905757242391, + "grad_norm": 0.5166565179824829, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.5699098110198975, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 3270 + }, + { + "epoch": 2.4055738907224056, + "grad_norm": 0.4472540020942688, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 3280 + }, + { + "epoch": 2.412907957462413, + "grad_norm": 0.6790403127670288, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3290 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 0.5182185173034668, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 3300 + }, + { + "epoch": 2.4275760909424275, + "grad_norm": 0.564647912979126, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 3310 + }, + { + "epoch": 2.434910157682435, + "grad_norm": 0.5625313520431519, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 3320 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.7496559619903564, + "learning_rate": 0.0002, + "loss": 0.8798, + "step": 3330 + }, + { + "epoch": 2.4495782911624495, + "grad_norm": 0.4779128134250641, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 2.456912357902457, + "grad_norm": 0.578093409538269, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3350 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.5456080436706543, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 3360 + }, + { + "epoch": 2.4715804913824715, + "grad_norm": 0.4769273102283478, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 3370 + }, + { + "epoch": 2.478914558122479, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0002, + "loss": 0.9312, + "step": 3380 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.5590165853500366, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3390 + }, + { + "epoch": 2.4935826916024935, + "grad_norm": 0.801306962966919, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 3400 + }, + { + "epoch": 2.500916758342501, + "grad_norm": 0.6045624613761902, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 3410 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.5735858082771301, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 3420 + }, + { + "epoch": 2.5155848918225154, + "grad_norm": 0.6827309131622314, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 3430 + }, + { + "epoch": 2.522918958562523, + "grad_norm": 0.5702602863311768, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 3440 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.6674721240997314, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 3450 + }, + { + "epoch": 2.5375870920425374, + "grad_norm": 0.5635907649993896, + "learning_rate": 0.0002, + "loss": 0.914, + "step": 3460 + }, + { + "epoch": 2.544921158782545, + "grad_norm": 0.42737770080566406, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 3470 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.6720691919326782, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 2.55958929226256, + "grad_norm": 0.8917084336280823, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 3490 + }, + { + "epoch": 2.566923359002567, + "grad_norm": 0.5134549140930176, + "learning_rate": 0.0002, + "loss": 0.9257, + "step": 3500 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.4951367974281311, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 3510 + }, + { + "epoch": 2.5815914924825814, + "grad_norm": 0.9438204765319824, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 3520 + }, + { + "epoch": 2.588925559222589, + "grad_norm": 0.6024714708328247, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 3530 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.5248535871505737, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 3540 + }, + { + "epoch": 2.6035936927026038, + "grad_norm": 0.8677568435668945, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 3550 + }, + { + "epoch": 2.610927759442611, + "grad_norm": 0.82008296251297, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 3560 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.4724634885787964, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 3570 + }, + { + "epoch": 2.6255958929226257, + "grad_norm": 0.5434244275093079, + "learning_rate": 0.0002, + "loss": 0.9058, + "step": 3580 + }, + { + "epoch": 2.6329299596626328, + "grad_norm": 0.4948740005493164, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 3590 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.42109328508377075, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3600 + }, + { + "epoch": 2.6475980931426477, + "grad_norm": 0.7979786396026611, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 3610 + }, + { + "epoch": 2.654932159882655, + "grad_norm": 0.6345919370651245, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 3620 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.4971671402454376, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 3630 + }, + { + "epoch": 2.6696002933626697, + "grad_norm": 0.6467748284339905, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 3640 + }, + { + "epoch": 2.6769343601026767, + "grad_norm": 0.4240160286426544, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 3650 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.5179754495620728, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3660 + }, + { + "epoch": 2.6916024935826917, + "grad_norm": 0.754012405872345, + "learning_rate": 0.0002, + "loss": 0.9221, + "step": 3670 + }, + { + "epoch": 2.698936560322699, + "grad_norm": 0.5141299962997437, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3680 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.5737819075584412, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 3690 + }, + { + "epoch": 2.7136046938027136, + "grad_norm": 0.5887577533721924, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 3700 + }, + { + "epoch": 2.720938760542721, + "grad_norm": 0.6740471720695496, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 3710 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 3720 + }, + { + "epoch": 2.7356068940227356, + "grad_norm": 0.4858354926109314, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 3730 + }, + { + "epoch": 2.742940960762743, + "grad_norm": 0.5489001870155334, + "learning_rate": 0.0002, + "loss": 0.9308, + "step": 3740 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.8187092542648315, + "learning_rate": 0.0002, + "loss": 0.894, + "step": 3750 + }, + { + "epoch": 2.7576090942427576, + "grad_norm": 0.5666626691818237, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 3760 + }, + { + "epoch": 2.764943160982765, + "grad_norm": 0.5377066135406494, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 3770 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.566330075263977, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 3780 + }, + { + "epoch": 2.7796112944627795, + "grad_norm": 0.5522832870483398, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 3790 + }, + { + "epoch": 2.786945361202787, + "grad_norm": 0.5668695569038391, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3800 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.7566602826118469, + "learning_rate": 0.0002, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 2.8016134946828015, + "grad_norm": 0.5603684782981873, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 3820 + }, + { + "epoch": 2.808947561422809, + "grad_norm": 0.49122217297554016, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.6798251867294312, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 3840 + }, + { + "epoch": 2.8236156949028235, + "grad_norm": 0.6097991466522217, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 3850 + }, + { + "epoch": 2.830949761642831, + "grad_norm": 0.6675726175308228, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 3860 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.9223952889442444, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 3870 + }, + { + "epoch": 2.8456178951228455, + "grad_norm": 0.6020799875259399, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 3880 + }, + { + "epoch": 2.852951961862853, + "grad_norm": 0.5206381678581238, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3890 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.6268777251243591, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 3900 + }, + { + "epoch": 2.8676200953428674, + "grad_norm": 1.1583497524261475, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 3910 + }, + { + "epoch": 2.874954162082875, + "grad_norm": 0.7263903021812439, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 3920 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.5369910001754761, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 3930 + }, + { + "epoch": 2.88962229556289, + "grad_norm": 0.7298350930213928, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 3940 + }, + { + "epoch": 2.896956362302897, + "grad_norm": 0.577012836933136, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.5859594345092773, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 3960 + }, + { + "epoch": 2.9116244957829114, + "grad_norm": 0.47176122665405273, + "learning_rate": 0.0002, + "loss": 0.8772, + "step": 3970 + }, + { + "epoch": 2.918958562522919, + "grad_norm": 0.9699620604515076, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 3980 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.7908747792243958, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3990 + }, + { + "epoch": 2.933626696002934, + "grad_norm": 0.5777379274368286, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 4000 + }, + { + "epoch": 2.940960762742941, + "grad_norm": 0.599288284778595, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 4010 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.5232274532318115, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 4020 + }, + { + "epoch": 2.9556288962229558, + "grad_norm": 0.6395137310028076, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 4030 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.589260458946228, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 4040 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5699581503868103, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 4050 + }, + { + "epoch": 2.9776310964429777, + "grad_norm": 0.528468132019043, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 4060 + }, + { + "epoch": 2.984965163182985, + "grad_norm": 0.4804670512676239, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4070 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 1.1918889284133911, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 4080 + }, + { + "epoch": 2.9996332966629997, + "grad_norm": 0.5479103326797485, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 4090 + }, + { + "epoch": 2.9996332966629997, + "eval_loss": 1.1642853021621704, + "eval_runtime": 32.7511, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.649, + "step": 4090 + }, + { + "epoch": 3.006967363403007, + "grad_norm": 0.7430027723312378, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 4100 + }, + { + "epoch": 3.014301430143014, + "grad_norm": 0.6293647289276123, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4110 + }, + { + "epoch": 3.0216354968830217, + "grad_norm": 0.6191329956054688, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 4120 + }, + { + "epoch": 3.028969563623029, + "grad_norm": 0.7959313988685608, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4130 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 0.5956351161003113, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 4140 + }, + { + "epoch": 3.0436376971030437, + "grad_norm": 0.670383632183075, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 4150 + }, + { + "epoch": 3.050971763843051, + "grad_norm": 0.6414518356323242, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 4160 + }, + { + "epoch": 3.058305830583058, + "grad_norm": 0.7928852438926697, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 4170 + }, + { + "epoch": 3.0656398973230656, + "grad_norm": 0.6211121082305908, + "learning_rate": 0.0002, + "loss": 0.7914, + "step": 4180 + }, + { + "epoch": 3.072973964063073, + "grad_norm": 0.6237057447433472, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 4190 + }, + { + "epoch": 3.08030803080308, + "grad_norm": 0.6522233486175537, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 4200 + }, + { + "epoch": 3.0876420975430876, + "grad_norm": 0.9396848678588867, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4210 + }, + { + "epoch": 3.094976164283095, + "grad_norm": 0.8003010749816895, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 4220 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.6733810305595398, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 4230 + }, + { + "epoch": 3.1096442977631096, + "grad_norm": 0.6365828514099121, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 4240 + }, + { + "epoch": 3.116978364503117, + "grad_norm": 1.0805548429489136, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4250 + }, + { + "epoch": 3.1243124312431245, + "grad_norm": 0.7262141108512878, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4260 + }, + { + "epoch": 3.1316464979831315, + "grad_norm": 0.5500539541244507, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 4270 + }, + { + "epoch": 3.138980564723139, + "grad_norm": 0.793912947177887, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 4280 + }, + { + "epoch": 3.1463146314631465, + "grad_norm": 1.2540518045425415, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 4290 + }, + { + "epoch": 3.1536486982031535, + "grad_norm": 0.7020077705383301, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 4300 + }, + { + "epoch": 3.160982764943161, + "grad_norm": 0.5111123323440552, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 4310 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.7172090411186218, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 4320 + }, + { + "epoch": 3.1756508984231755, + "grad_norm": 0.6343168616294861, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 4330 + }, + { + "epoch": 3.182984965163183, + "grad_norm": 0.9563672542572021, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4340 + }, + { + "epoch": 3.1903190319031904, + "grad_norm": 1.0225574970245361, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4350 + }, + { + "epoch": 3.1976530986431975, + "grad_norm": 1.1633386611938477, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 4360 + }, + { + "epoch": 3.204987165383205, + "grad_norm": 0.8915148973464966, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 4370 + }, + { + "epoch": 3.2123212321232124, + "grad_norm": 0.9156812429428101, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4380 + }, + { + "epoch": 3.21965529886322, + "grad_norm": 0.6363258957862854, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 4390 + }, + { + "epoch": 3.226989365603227, + "grad_norm": 0.579099178314209, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 4400 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 0.8778146505355835, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 4410 + }, + { + "epoch": 3.241657499083242, + "grad_norm": 0.8356770873069763, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 4420 + }, + { + "epoch": 3.248991565823249, + "grad_norm": 0.702032208442688, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 4430 + }, + { + "epoch": 3.2563256325632564, + "grad_norm": 0.6386539340019226, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 4440 + }, + { + "epoch": 3.263659699303264, + "grad_norm": 0.7008408904075623, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 4450 + }, + { + "epoch": 3.270993766043271, + "grad_norm": 0.9556332230567932, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 4460 + }, + { + "epoch": 3.2783278327832783, + "grad_norm": 0.5667835474014282, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 4470 + }, + { + "epoch": 3.285661899523286, + "grad_norm": 0.8239172697067261, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 4480 + }, + { + "epoch": 3.292995966263293, + "grad_norm": 0.7045050859451294, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 4490 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.7131434082984924, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 4500 + }, + { + "epoch": 3.3076640997433078, + "grad_norm": 0.6924910545349121, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 4510 + }, + { + "epoch": 3.3149981664833152, + "grad_norm": 0.8945356607437134, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 4520 + }, + { + "epoch": 3.3223322332233223, + "grad_norm": 0.6546903252601624, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 4530 + }, + { + "epoch": 3.3296662999633297, + "grad_norm": 0.8206679224967957, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4540 + }, + { + "epoch": 3.3370003667033368, + "grad_norm": 0.6482203602790833, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 4550 + }, + { + "epoch": 3.3443344334433442, + "grad_norm": 0.7558760046958923, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 4560 + }, + { + "epoch": 3.3516685001833517, + "grad_norm": 0.7794756889343262, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 4570 + }, + { + "epoch": 3.359002566923359, + "grad_norm": 0.7382805943489075, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4580 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.5912511944770813, + "learning_rate": 0.0002, + "loss": 0.8511, + "step": 4590 + }, + { + "epoch": 3.3736707004033737, + "grad_norm": 0.7444885969161987, + "learning_rate": 0.0002, + "loss": 0.8272, + "step": 4600 + }, + { + "epoch": 3.381004767143381, + "grad_norm": 0.7354922890663147, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 4610 + }, + { + "epoch": 3.388338833883388, + "grad_norm": 0.7685934901237488, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 4620 + }, + { + "epoch": 3.3956729006233957, + "grad_norm": 0.61041259765625, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 4630 + }, + { + "epoch": 3.403006967363403, + "grad_norm": 0.6820451021194458, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 4640 + }, + { + "epoch": 3.41034103410341, + "grad_norm": 0.5819534063339233, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 4650 + }, + { + "epoch": 3.4176751008434176, + "grad_norm": 0.705410897731781, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 4660 + }, + { + "epoch": 3.425009167583425, + "grad_norm": 0.8052892088890076, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 4670 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.7746483087539673, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 4680 + }, + { + "epoch": 3.4396773010634396, + "grad_norm": 0.7713689804077148, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 4690 + }, + { + "epoch": 3.447011367803447, + "grad_norm": 0.810371994972229, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 4700 + }, + { + "epoch": 3.4543454345434546, + "grad_norm": 0.7702969312667847, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4710 + }, + { + "epoch": 3.4616795012834616, + "grad_norm": 0.7069268822669983, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4720 + }, + { + "epoch": 3.469013568023469, + "grad_norm": 0.7640359401702881, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 4730 + }, + { + "epoch": 3.4763476347634765, + "grad_norm": 0.8661707639694214, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 4740 + }, + { + "epoch": 3.4836817015034836, + "grad_norm": 0.9970282912254333, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 4750 + }, + { + "epoch": 3.491015768243491, + "grad_norm": 0.5824355483055115, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 4760 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 1.3072649240493774, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 4770 + }, + { + "epoch": 3.5056839017235055, + "grad_norm": 0.873978316783905, + "learning_rate": 0.0002, + "loss": 0.9101, + "step": 4780 + }, + { + "epoch": 3.513017968463513, + "grad_norm": 0.5526657104492188, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4790 + }, + { + "epoch": 3.5203520352035205, + "grad_norm": 0.790894627571106, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 4800 + }, + { + "epoch": 3.5276861019435275, + "grad_norm": 0.8119630217552185, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 4810 + }, + { + "epoch": 3.535020168683535, + "grad_norm": 0.633212149143219, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 4820 + }, + { + "epoch": 3.5423542354235424, + "grad_norm": 0.703029990196228, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 4830 + }, + { + "epoch": 3.54968830216355, + "grad_norm": 0.7603771686553955, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 4840 + }, + { + "epoch": 3.557022368903557, + "grad_norm": 0.6260480880737305, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 4850 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.8203664422035217, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 4860 + }, + { + "epoch": 3.5716905023835714, + "grad_norm": 0.7793813347816467, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 4870 + }, + { + "epoch": 3.579024569123579, + "grad_norm": 0.7667397260665894, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 4880 + }, + { + "epoch": 3.5863586358635864, + "grad_norm": 0.8198829889297485, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 4890 + }, + { + "epoch": 3.593692702603594, + "grad_norm": 0.7689233422279358, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 4900 + }, + { + "epoch": 3.601026769343601, + "grad_norm": 0.7870983481407166, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 4910 + }, + { + "epoch": 3.6083608360836084, + "grad_norm": 0.8133853077888489, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 4920 + }, + { + "epoch": 3.615694902823616, + "grad_norm": 1.308401346206665, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4930 + }, + { + "epoch": 3.623028969563623, + "grad_norm": 0.7131121754646301, + "learning_rate": 0.0002, + "loss": 0.8494, + "step": 4940 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.6825910210609436, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 4950 + }, + { + "epoch": 3.637697103043638, + "grad_norm": 0.7254678606987, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4960 + }, + { + "epoch": 3.6450311697836453, + "grad_norm": 0.8045085072517395, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4970 + }, + { + "epoch": 3.6523652365236523, + "grad_norm": 0.6991777420043945, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 4980 + }, + { + "epoch": 3.6596993032636598, + "grad_norm": 0.7804713249206543, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 4990 + }, + { + "epoch": 3.667033370003667, + "grad_norm": 0.8525708317756653, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 5000 + }, + { + "epoch": 3.6743674367436743, + "grad_norm": 0.7959994673728943, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 5010 + }, + { + "epoch": 3.6817015034836817, + "grad_norm": 0.8103628158569336, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5020 + }, + { + "epoch": 3.689035570223689, + "grad_norm": 0.7517836093902588, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 5030 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.6878514289855957, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 5040 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 1.2371820211410522, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 5050 + }, + { + "epoch": 3.711037770443711, + "grad_norm": 0.6567103862762451, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 5060 + }, + { + "epoch": 3.718371837183718, + "grad_norm": 1.1254922151565552, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 5070 + }, + { + "epoch": 3.7257059039237257, + "grad_norm": 0.6796132326126099, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 5080 + }, + { + "epoch": 3.733039970663733, + "grad_norm": 0.7285300493240356, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5090 + }, + { + "epoch": 3.7403740374037406, + "grad_norm": 0.8931500911712646, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 5100 + }, + { + "epoch": 3.7477081041437477, + "grad_norm": 0.6256856918334961, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 5110 + }, + { + "epoch": 3.755042170883755, + "grad_norm": 0.79310142993927, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5120 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.6594041585922241, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 5130 + }, + { + "epoch": 3.7697103043637696, + "grad_norm": 0.7029327750205994, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 5140 + }, + { + "epoch": 3.777044371103777, + "grad_norm": 0.5880070328712463, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 5150 + }, + { + "epoch": 3.7843784378437846, + "grad_norm": 0.7578945159912109, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 5160 + }, + { + "epoch": 3.7917125045837916, + "grad_norm": 0.8276378512382507, + "learning_rate": 0.0002, + "loss": 0.8819, + "step": 5170 + }, + { + "epoch": 3.799046571323799, + "grad_norm": 0.7627953886985779, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 5180 + }, + { + "epoch": 3.806380638063806, + "grad_norm": 0.8169086575508118, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 5190 + }, + { + "epoch": 3.8137147048038136, + "grad_norm": 0.6605030298233032, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 5200 + }, + { + "epoch": 3.821048771543821, + "grad_norm": 0.5837286114692688, + "learning_rate": 0.0002, + "loss": 0.8804, + "step": 5210 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 1.2422157526016235, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 5220 + }, + { + "epoch": 3.8357169050238356, + "grad_norm": 0.6589220762252808, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 5230 + }, + { + "epoch": 3.843050971763843, + "grad_norm": 0.8567556142807007, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5240 + }, + { + "epoch": 3.8503850385038505, + "grad_norm": 0.6490627527236938, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 5250 + }, + { + "epoch": 3.8577191052438575, + "grad_norm": 0.620232880115509, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5260 + }, + { + "epoch": 3.865053171983865, + "grad_norm": 0.7685128450393677, + "learning_rate": 0.0002, + "loss": 0.9192, + "step": 5270 + }, + { + "epoch": 3.8723872387238725, + "grad_norm": 0.8113296627998352, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 5280 + }, + { + "epoch": 3.87972130546388, + "grad_norm": 0.8092675805091858, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 5290 + }, + { + "epoch": 3.887055372203887, + "grad_norm": 0.583570122718811, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 5300 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 1.712363600730896, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 5310 + }, + { + "epoch": 3.9017235056839015, + "grad_norm": 0.6673534512519836, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5320 + }, + { + "epoch": 3.909057572423909, + "grad_norm": 1.9770312309265137, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5330 + }, + { + "epoch": 3.9163916391639164, + "grad_norm": 0.6430999636650085, + "learning_rate": 0.0002, + "loss": 0.8793, + "step": 5340 + }, + { + "epoch": 3.923725705903924, + "grad_norm": 1.0159571170806885, + "learning_rate": 0.0002, + "loss": 0.839, + "step": 5350 + }, + { + "epoch": 3.931059772643931, + "grad_norm": 0.8607584834098816, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 5360 + }, + { + "epoch": 3.9383938393839384, + "grad_norm": 0.6967900991439819, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 5370 + }, + { + "epoch": 3.945727906123946, + "grad_norm": 0.7683077454566956, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 5380 + }, + { + "epoch": 3.953061972863953, + "grad_norm": 0.6805762648582458, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5390 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.7033619284629822, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5400 + }, + { + "epoch": 3.967730106343968, + "grad_norm": 0.966112494468689, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5410 + }, + { + "epoch": 3.9750641730839753, + "grad_norm": 0.8467881083488464, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 5420 + }, + { + "epoch": 3.9823982398239823, + "grad_norm": 0.8005317449569702, + "learning_rate": 0.0002, + "loss": 0.8084, + "step": 5430 + }, + { + "epoch": 3.98973230656399, + "grad_norm": 1.1615241765975952, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 5440 + }, + { + "epoch": 3.997066373303997, + "grad_norm": 0.6121614575386047, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 5450 + }, + { + "epoch": 4.0, + "eval_loss": 1.1834222078323364, + "eval_runtime": 32.7569, + "eval_samples_per_second": 13.158, + "eval_steps_per_second": 1.649, + "step": 5454 + }, + { + "epoch": 4.004400440044004, + "grad_norm": 0.6055727005004883, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 5460 + }, + { + "epoch": 4.011734506784012, + "grad_norm": 0.8232647180557251, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 5470 + }, + { + "epoch": 4.019068573524019, + "grad_norm": 0.7739192247390747, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5480 + }, + { + "epoch": 4.026402640264027, + "grad_norm": 0.6264950633049011, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 5490 + }, + { + "epoch": 4.033736707004033, + "grad_norm": 1.4798702001571655, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 5500 + }, + { + "epoch": 4.041070773744041, + "grad_norm": 0.9538470506668091, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 5510 + }, + { + "epoch": 4.048404840484048, + "grad_norm": 0.834561288356781, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 5520 + }, + { + "epoch": 4.055738907224056, + "grad_norm": 0.6407850384712219, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 5530 + }, + { + "epoch": 4.063072973964063, + "grad_norm": 0.9035961627960205, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 5540 + }, + { + "epoch": 4.070407040704071, + "grad_norm": 0.842812716960907, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 5550 + }, + { + "epoch": 4.077741107444078, + "grad_norm": 0.8197882175445557, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 5560 + }, + { + "epoch": 4.085075174184085, + "grad_norm": 0.8652673959732056, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 5570 + }, + { + "epoch": 4.092409240924092, + "grad_norm": 0.8048318028450012, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 5580 + }, + { + "epoch": 4.0997433076641, + "grad_norm": 0.9604969024658203, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 5590 + }, + { + "epoch": 4.107077374404107, + "grad_norm": 1.244756817817688, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 5600 + }, + { + "epoch": 4.114411441144115, + "grad_norm": 0.7975269556045532, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5610 + }, + { + "epoch": 4.121745507884122, + "grad_norm": 0.6130099296569824, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 5620 + }, + { + "epoch": 4.129079574624129, + "grad_norm": 0.7793202996253967, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 5630 + }, + { + "epoch": 4.136413641364136, + "grad_norm": 1.187238335609436, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 5640 + }, + { + "epoch": 4.143747708104144, + "grad_norm": 0.8450375199317932, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 5650 + }, + { + "epoch": 4.151081774844151, + "grad_norm": 0.9006940126419067, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5660 + }, + { + "epoch": 4.158415841584159, + "grad_norm": 0.9447154998779297, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 5670 + }, + { + "epoch": 4.165749908324166, + "grad_norm": 0.798032283782959, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 5680 + }, + { + "epoch": 4.1730839750641735, + "grad_norm": 0.65578693151474, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 5690 + }, + { + "epoch": 4.18041804180418, + "grad_norm": 1.0864700078964233, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 5700 + }, + { + "epoch": 4.187752108544188, + "grad_norm": 0.7344121932983398, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 5710 + }, + { + "epoch": 4.195086175284195, + "grad_norm": 0.9722456932067871, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 5720 + }, + { + "epoch": 4.2024202420242025, + "grad_norm": 1.263814926147461, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 5730 + }, + { + "epoch": 4.20975430876421, + "grad_norm": 0.9622581005096436, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 5740 + }, + { + "epoch": 4.2170883755042174, + "grad_norm": 0.8497143387794495, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 5750 + }, + { + "epoch": 4.224422442244224, + "grad_norm": 0.8248446583747864, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 5760 + }, + { + "epoch": 4.2317565089842315, + "grad_norm": 1.2544798851013184, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 5770 + }, + { + "epoch": 4.239090575724239, + "grad_norm": 0.8224676251411438, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5780 + }, + { + "epoch": 4.2464246424642464, + "grad_norm": 0.8924877047538757, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 5790 + }, + { + "epoch": 4.253758709204254, + "grad_norm": 0.8545848727226257, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 5800 + }, + { + "epoch": 4.261092775944261, + "grad_norm": 0.8081067800521851, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 5810 + }, + { + "epoch": 4.268426842684269, + "grad_norm": 0.7111002802848816, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 5820 + }, + { + "epoch": 4.2757609094242754, + "grad_norm": 0.8696979880332947, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 5830 + }, + { + "epoch": 4.283094976164283, + "grad_norm": 0.821401834487915, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 5840 + }, + { + "epoch": 4.29042904290429, + "grad_norm": 0.888908326625824, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 5850 + }, + { + "epoch": 4.297763109644298, + "grad_norm": 1.9380123615264893, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 5860 + }, + { + "epoch": 4.305097176384305, + "grad_norm": 1.121774435043335, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 5870 + }, + { + "epoch": 4.312431243124313, + "grad_norm": 0.9238282442092896, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 5880 + }, + { + "epoch": 4.319765309864319, + "grad_norm": 0.7321620583534241, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 5890 + }, + { + "epoch": 4.327099376604327, + "grad_norm": 0.8739548325538635, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 5900 + }, + { + "epoch": 4.334433443344334, + "grad_norm": 0.9686012268066406, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 5910 + }, + { + "epoch": 4.341767510084342, + "grad_norm": 0.9033839106559753, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5920 + }, + { + "epoch": 4.349101576824349, + "grad_norm": 0.8131115436553955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 5930 + }, + { + "epoch": 4.356435643564357, + "grad_norm": 0.8942412734031677, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 5940 + }, + { + "epoch": 4.363769710304364, + "grad_norm": 0.8439112901687622, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 5950 + }, + { + "epoch": 4.371103777044371, + "grad_norm": 0.9176713228225708, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 5960 + }, + { + "epoch": 4.378437843784378, + "grad_norm": 0.6799634695053101, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 5970 + }, + { + "epoch": 4.385771910524386, + "grad_norm": 1.0435824394226074, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5980 + }, + { + "epoch": 4.393105977264393, + "grad_norm": 0.997937798500061, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 5990 + }, + { + "epoch": 4.400440044004401, + "grad_norm": 1.0308842658996582, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 6000 + }, + { + "epoch": 4.407774110744408, + "grad_norm": 1.3683775663375854, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 6010 + }, + { + "epoch": 4.415108177484415, + "grad_norm": 0.7569534182548523, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 6020 + }, + { + "epoch": 4.422442244224422, + "grad_norm": 1.089978575706482, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 6030 + }, + { + "epoch": 4.42977631096443, + "grad_norm": 0.7522459626197815, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 6040 + }, + { + "epoch": 4.437110377704437, + "grad_norm": 0.6709823608398438, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 6050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.6992089748382568, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 6060 + }, + { + "epoch": 4.451778511184452, + "grad_norm": 1.0182931423187256, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 6070 + }, + { + "epoch": 4.459112577924459, + "grad_norm": 1.0685160160064697, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 6080 + }, + { + "epoch": 4.466446644664466, + "grad_norm": 0.8295124769210815, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 6090 + }, + { + "epoch": 4.473780711404474, + "grad_norm": 1.1862998008728027, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6100 + }, + { + "epoch": 4.481114778144481, + "grad_norm": 0.7400273084640503, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 6110 + }, + { + "epoch": 4.488448844884489, + "grad_norm": 0.7098417282104492, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 6120 + }, + { + "epoch": 4.495782911624496, + "grad_norm": 0.9745053648948669, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 6130 + }, + { + "epoch": 4.503116978364503, + "grad_norm": 0.8638797998428345, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 6140 + }, + { + "epoch": 4.51045104510451, + "grad_norm": 0.8291046619415283, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6150 + }, + { + "epoch": 4.517785111844518, + "grad_norm": 1.0301737785339355, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 6160 + }, + { + "epoch": 4.525119178584525, + "grad_norm": 1.1996512413024902, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 6170 + }, + { + "epoch": 4.5324532453245325, + "grad_norm": 1.151038408279419, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 6180 + }, + { + "epoch": 4.53978731206454, + "grad_norm": 0.8385201096534729, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 6190 + }, + { + "epoch": 4.5471213788045475, + "grad_norm": 0.8969188332557678, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 6200 + }, + { + "epoch": 4.554455445544555, + "grad_norm": 1.60659658908844, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 6210 + }, + { + "epoch": 4.5617895122845615, + "grad_norm": 0.9356731176376343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 6220 + }, + { + "epoch": 4.569123579024569, + "grad_norm": 0.95856773853302, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 6230 + }, + { + "epoch": 4.5764576457645765, + "grad_norm": 1.1162524223327637, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 6240 + }, + { + "epoch": 4.583791712504584, + "grad_norm": 0.8809238076210022, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 6250 + }, + { + "epoch": 4.591125779244591, + "grad_norm": 0.890738844871521, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 6260 + }, + { + "epoch": 4.598459845984598, + "grad_norm": 0.918684720993042, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 6270 + }, + { + "epoch": 4.6057939127246055, + "grad_norm": 0.8156296610832214, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 6280 + }, + { + "epoch": 4.613127979464613, + "grad_norm": 1.046634316444397, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6290 + }, + { + "epoch": 4.62046204620462, + "grad_norm": 0.7725525498390198, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 6300 + }, + { + "epoch": 4.627796112944628, + "grad_norm": 0.9992046356201172, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 6310 + }, + { + "epoch": 4.635130179684635, + "grad_norm": 0.8480095267295837, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 6320 + }, + { + "epoch": 4.642464246424643, + "grad_norm": 0.7061955332756042, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 6330 + }, + { + "epoch": 4.649798313164649, + "grad_norm": 1.0354212522506714, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 6340 + }, + { + "epoch": 4.657132379904657, + "grad_norm": 1.0081377029418945, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6350 + }, + { + "epoch": 4.664466446644664, + "grad_norm": 1.2904249429702759, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 6360 + }, + { + "epoch": 4.671800513384672, + "grad_norm": 0.9248910546302795, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 6370 + }, + { + "epoch": 4.679134580124679, + "grad_norm": 0.9907804131507874, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 6380 + }, + { + "epoch": 4.686468646864687, + "grad_norm": 1.201143741607666, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 6390 + }, + { + "epoch": 4.693802713604693, + "grad_norm": 0.8709394335746765, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 6400 + }, + { + "epoch": 4.701136780344701, + "grad_norm": 0.7468608021736145, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 6410 + }, + { + "epoch": 4.708470847084708, + "grad_norm": 0.8607903718948364, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 6420 + }, + { + "epoch": 4.715804913824716, + "grad_norm": 0.9840512871742249, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 6430 + }, + { + "epoch": 4.723138980564723, + "grad_norm": 0.8328204154968262, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 6440 + }, + { + "epoch": 4.730473047304731, + "grad_norm": 0.924505352973938, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 6450 + }, + { + "epoch": 4.737807114044738, + "grad_norm": 0.8897685408592224, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 6460 + }, + { + "epoch": 4.745141180784745, + "grad_norm": 0.9605024456977844, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6470 + }, + { + "epoch": 4.752475247524752, + "grad_norm": 0.8150759935379028, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 6480 + }, + { + "epoch": 4.75980931426476, + "grad_norm": 0.8128412961959839, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 6490 + }, + { + "epoch": 4.767143381004767, + "grad_norm": 0.7381404638290405, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 6500 + }, + { + "epoch": 4.774477447744775, + "grad_norm": 1.0565853118896484, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 6510 + }, + { + "epoch": 4.781811514484782, + "grad_norm": 0.9298134446144104, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6520 + }, + { + "epoch": 4.789145581224789, + "grad_norm": 1.0145525932312012, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 6530 + }, + { + "epoch": 4.796479647964796, + "grad_norm": 0.92259681224823, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 6540 + }, + { + "epoch": 4.803813714704804, + "grad_norm": 0.7881024479866028, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 6550 + }, + { + "epoch": 4.811147781444811, + "grad_norm": 1.4935206174850464, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 6560 + }, + { + "epoch": 4.818481848184819, + "grad_norm": 0.8612369298934937, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 6570 + }, + { + "epoch": 4.825815914924826, + "grad_norm": 1.0118653774261475, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 6580 + }, + { + "epoch": 4.833149981664834, + "grad_norm": 1.1303809881210327, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 6590 + }, + { + "epoch": 4.84048404840484, + "grad_norm": 0.9112492203712463, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 6600 + }, + { + "epoch": 4.847818115144848, + "grad_norm": 0.864762544631958, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 6610 + }, + { + "epoch": 4.855152181884855, + "grad_norm": 0.9090572595596313, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 6620 + }, + { + "epoch": 4.862486248624863, + "grad_norm": 1.014953374862671, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 6630 + }, + { + "epoch": 4.86982031536487, + "grad_norm": 1.0702149868011475, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 6640 + }, + { + "epoch": 4.8771543821048775, + "grad_norm": 1.002135157585144, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 6650 + }, + { + "epoch": 4.884488448844884, + "grad_norm": 0.862545907497406, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 6660 + }, + { + "epoch": 4.891822515584892, + "grad_norm": 0.7302131056785583, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 6670 + }, + { + "epoch": 4.899156582324899, + "grad_norm": 0.8380730152130127, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 6680 + }, + { + "epoch": 4.9064906490649065, + "grad_norm": 0.7956018447875977, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 6690 + }, + { + "epoch": 4.913824715804914, + "grad_norm": 0.6717583537101746, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 6700 + }, + { + "epoch": 4.9211587825449215, + "grad_norm": 1.09099280834198, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 6710 + }, + { + "epoch": 4.928492849284929, + "grad_norm": 0.8589889407157898, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 6720 + }, + { + "epoch": 4.9358269160249355, + "grad_norm": 1.0046314001083374, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 6730 + }, + { + "epoch": 4.943160982764943, + "grad_norm": 0.8559659123420715, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 6740 + }, + { + "epoch": 4.9504950495049505, + "grad_norm": 0.8588525652885437, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 6750 + }, + { + "epoch": 4.957829116244958, + "grad_norm": 0.9192708134651184, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 6760 + }, + { + "epoch": 4.965163182984965, + "grad_norm": 1.051398754119873, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 6770 + }, + { + "epoch": 4.972497249724973, + "grad_norm": 0.9111362099647522, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 6780 + }, + { + "epoch": 4.9798313164649795, + "grad_norm": 0.7305638194084167, + "learning_rate": 0.0002, + "loss": 0.7613, + "step": 6790 + }, + { + "epoch": 4.987165383204987, + "grad_norm": 1.118837594985962, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 6800 + }, + { + "epoch": 4.994499449944994, + "grad_norm": 0.9075239300727844, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 6810 + }, + { + "epoch": 4.999633296662999, + "eval_loss": 1.2361247539520264, + "eval_runtime": 32.7325, + "eval_samples_per_second": 13.167, + "eval_steps_per_second": 1.65, + "step": 6817 + }, + { + "epoch": 5.001833516685002, + "grad_norm": 1.0541315078735352, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 6820 + }, + { + "epoch": 5.009167583425009, + "grad_norm": 0.9750140905380249, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 6830 + }, + { + "epoch": 5.016501650165017, + "grad_norm": 0.931838870048523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 6840 + }, + { + "epoch": 5.023835716905023, + "grad_norm": 1.110278844833374, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 6850 + }, + { + "epoch": 5.031169783645031, + "grad_norm": 1.0670180320739746, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 6860 + }, + { + "epoch": 5.038503850385038, + "grad_norm": 0.8762092590332031, + "learning_rate": 0.0002, + "loss": 0.4374, + "step": 6870 + }, + { + "epoch": 5.045837917125046, + "grad_norm": 1.1169432401657104, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 6880 + }, + { + "epoch": 5.053171983865053, + "grad_norm": 1.005491018295288, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 6890 + }, + { + "epoch": 5.060506050605061, + "grad_norm": 1.1751841306686401, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 6900 + }, + { + "epoch": 5.067840117345068, + "grad_norm": 0.8501367568969727, + "learning_rate": 0.0002, + "loss": 0.451, + "step": 6910 + }, + { + "epoch": 5.075174184085075, + "grad_norm": 0.9795131683349609, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 6920 + }, + { + "epoch": 5.082508250825082, + "grad_norm": 0.8929879665374756, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 6930 + }, + { + "epoch": 5.08984231756509, + "grad_norm": 1.0156651735305786, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 6940 + }, + { + "epoch": 5.097176384305097, + "grad_norm": 1.0974335670471191, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 6950 + }, + { + "epoch": 5.104510451045105, + "grad_norm": 1.7015666961669922, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 6960 + }, + { + "epoch": 5.111844517785112, + "grad_norm": 1.0343226194381714, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 6970 + }, + { + "epoch": 5.119178584525119, + "grad_norm": 1.3072983026504517, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 6980 + }, + { + "epoch": 5.126512651265126, + "grad_norm": 1.038986086845398, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6990 + }, + { + "epoch": 5.133846718005134, + "grad_norm": 0.8638386130332947, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 7000 + }, + { + "epoch": 5.141180784745141, + "grad_norm": 0.8326523900032043, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 7010 + }, + { + "epoch": 5.148514851485149, + "grad_norm": 1.0976895093917847, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 7020 + }, + { + "epoch": 5.155848918225156, + "grad_norm": 1.0077873468399048, + "learning_rate": 0.0002, + "loss": 0.4677, + "step": 7030 + }, + { + "epoch": 5.163182984965164, + "grad_norm": 1.0662257671356201, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 7040 + }, + { + "epoch": 5.17051705170517, + "grad_norm": 1.206271767616272, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 7050 + }, + { + "epoch": 5.177851118445178, + "grad_norm": 1.1990262269973755, + "learning_rate": 0.0002, + "loss": 0.4817, + "step": 7060 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 1.0207163095474243, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 7070 + }, + { + "epoch": 5.192519251925193, + "grad_norm": 1.2783987522125244, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 7080 + }, + { + "epoch": 5.1998533186652, + "grad_norm": 1.1592512130737305, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 7090 + }, + { + "epoch": 5.2071873854052075, + "grad_norm": 1.1053160429000854, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 7100 + }, + { + "epoch": 5.214521452145214, + "grad_norm": 1.1925510168075562, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 7110 + }, + { + "epoch": 5.221855518885222, + "grad_norm": 1.0714877843856812, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 7120 + }, + { + "epoch": 5.229189585625229, + "grad_norm": 0.9451011419296265, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 7130 + }, + { + "epoch": 5.2365236523652365, + "grad_norm": 1.03838050365448, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 7140 + }, + { + "epoch": 5.243857719105244, + "grad_norm": 0.9204146265983582, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 7150 + }, + { + "epoch": 5.2511917858452515, + "grad_norm": 1.0142229795455933, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 7160 + }, + { + "epoch": 5.258525852585258, + "grad_norm": 1.4432005882263184, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 7170 + }, + { + "epoch": 5.2658599193252655, + "grad_norm": 1.1239633560180664, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 7180 + }, + { + "epoch": 5.273193986065273, + "grad_norm": 0.7012821435928345, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 7190 + }, + { + "epoch": 5.2805280528052805, + "grad_norm": 1.3499128818511963, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 7200 + }, + { + "epoch": 5.287862119545288, + "grad_norm": 0.9498730897903442, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 7210 + }, + { + "epoch": 5.295196186285295, + "grad_norm": 0.9552369117736816, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 7220 + }, + { + "epoch": 5.302530253025303, + "grad_norm": 0.7610348463058472, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 7230 + }, + { + "epoch": 5.3098643197653095, + "grad_norm": 1.0314512252807617, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 7240 + }, + { + "epoch": 5.317198386505317, + "grad_norm": 1.0534334182739258, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 7250 + }, + { + "epoch": 5.324532453245324, + "grad_norm": 1.2553406953811646, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 7260 + }, + { + "epoch": 5.331866519985332, + "grad_norm": 0.7061691880226135, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 7270 + }, + { + "epoch": 5.339200586725339, + "grad_norm": 0.9652578830718994, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 7280 + }, + { + "epoch": 5.346534653465347, + "grad_norm": 1.114788293838501, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 7290 + }, + { + "epoch": 5.353868720205353, + "grad_norm": 1.0940049886703491, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 7300 + }, + { + "epoch": 5.361202786945361, + "grad_norm": 1.0151008367538452, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 7310 + }, + { + "epoch": 5.368536853685368, + "grad_norm": 1.0369552373886108, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 7320 + }, + { + "epoch": 5.375870920425376, + "grad_norm": 0.8489866256713867, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 7330 + }, + { + "epoch": 5.383204987165383, + "grad_norm": 1.1031713485717773, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 7340 + }, + { + "epoch": 5.390539053905391, + "grad_norm": 0.9094716310501099, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 7350 + }, + { + "epoch": 5.397873120645398, + "grad_norm": 0.9530431032180786, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 7360 + }, + { + "epoch": 5.405207187385405, + "grad_norm": 0.9633604884147644, + "learning_rate": 0.0002, + "loss": 0.529, + "step": 7370 + }, + { + "epoch": 5.412541254125412, + "grad_norm": 0.9541662335395813, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 7380 + }, + { + "epoch": 5.41987532086542, + "grad_norm": 1.0459771156311035, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7390 + }, + { + "epoch": 5.427209387605427, + "grad_norm": 1.027388334274292, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 7400 + }, + { + "epoch": 5.434543454345435, + "grad_norm": 0.7267653346061707, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 7410 + }, + { + "epoch": 5.441877521085442, + "grad_norm": 1.020142674446106, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 7420 + }, + { + "epoch": 5.449211587825449, + "grad_norm": 1.044754147529602, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 7430 + }, + { + "epoch": 5.456545654565456, + "grad_norm": 1.5476195812225342, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 7440 + }, + { + "epoch": 5.463879721305464, + "grad_norm": 0.9879506826400757, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 7450 + }, + { + "epoch": 5.471213788045471, + "grad_norm": 1.2562980651855469, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 7460 + }, + { + "epoch": 5.478547854785479, + "grad_norm": 1.3051384687423706, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 7470 + }, + { + "epoch": 5.485881921525486, + "grad_norm": 1.0511597394943237, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 7480 + }, + { + "epoch": 5.493215988265494, + "grad_norm": 1.0380817651748657, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 7490 + }, + { + "epoch": 5.5005500550055, + "grad_norm": 1.170274257659912, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 7500 + }, + { + "epoch": 5.507884121745508, + "grad_norm": 1.3356517553329468, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 7510 + }, + { + "epoch": 5.515218188485515, + "grad_norm": 1.0727124214172363, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 7520 + }, + { + "epoch": 5.522552255225523, + "grad_norm": 1.0110199451446533, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 7530 + }, + { + "epoch": 5.52988632196553, + "grad_norm": 1.3086743354797363, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 7540 + }, + { + "epoch": 5.537220388705538, + "grad_norm": 1.1904916763305664, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 7550 + }, + { + "epoch": 5.544554455445544, + "grad_norm": 0.9466280937194824, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 7560 + }, + { + "epoch": 5.551888522185552, + "grad_norm": 1.1237901449203491, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 7570 + }, + { + "epoch": 5.559222588925559, + "grad_norm": 0.9590660333633423, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 7580 + }, + { + "epoch": 5.566556655665567, + "grad_norm": 1.0890778303146362, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 7590 + }, + { + "epoch": 5.573890722405574, + "grad_norm": 0.7206931114196777, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 7600 + }, + { + "epoch": 5.5812247891455815, + "grad_norm": 1.2884514331817627, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 7610 + }, + { + "epoch": 5.588558855885589, + "grad_norm": 0.7798039317131042, + "learning_rate": 0.0002, + "loss": 0.5279, + "step": 7620 + }, + { + "epoch": 5.595892922625596, + "grad_norm": 1.166046142578125, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 7630 + }, + { + "epoch": 5.603226989365603, + "grad_norm": 1.0150201320648193, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 7640 + }, + { + "epoch": 5.6105610561056105, + "grad_norm": 1.0449682474136353, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 7650 + }, + { + "epoch": 5.617895122845618, + "grad_norm": 0.9310530424118042, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 7660 + }, + { + "epoch": 5.6252291895856255, + "grad_norm": 0.9117933511734009, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 7670 + }, + { + "epoch": 5.632563256325633, + "grad_norm": 1.1475164890289307, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 7680 + }, + { + "epoch": 5.6398973230656395, + "grad_norm": 1.066809058189392, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 7690 + }, + { + "epoch": 5.647231389805647, + "grad_norm": 1.2834991216659546, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 7700 + }, + { + "epoch": 5.6545654565456545, + "grad_norm": 1.2245112657546997, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 7710 + }, + { + "epoch": 5.661899523285662, + "grad_norm": 1.1424106359481812, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 7720 + }, + { + "epoch": 5.669233590025669, + "grad_norm": 1.0673892498016357, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 7730 + }, + { + "epoch": 5.676567656765677, + "grad_norm": 1.4312121868133545, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 7740 + }, + { + "epoch": 5.683901723505684, + "grad_norm": 0.9976982474327087, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 7750 + }, + { + "epoch": 5.691235790245691, + "grad_norm": 0.9464678168296814, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 7760 + }, + { + "epoch": 5.698569856985698, + "grad_norm": 1.010995626449585, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 7770 + }, + { + "epoch": 5.705903923725706, + "grad_norm": 1.3787750005722046, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 7780 + }, + { + "epoch": 5.713237990465713, + "grad_norm": 1.020922303199768, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 7790 + }, + { + "epoch": 5.720572057205721, + "grad_norm": 0.9748636484146118, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 7800 + }, + { + "epoch": 5.727906123945728, + "grad_norm": 1.3077744245529175, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 7810 + }, + { + "epoch": 5.735240190685735, + "grad_norm": 1.4770057201385498, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 7820 + }, + { + "epoch": 5.742574257425742, + "grad_norm": 1.6349090337753296, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 7830 + }, + { + "epoch": 5.74990832416575, + "grad_norm": 0.9818630814552307, + "learning_rate": 0.0002, + "loss": 0.5056, + "step": 7840 + }, + { + "epoch": 5.757242390905757, + "grad_norm": 0.9659715890884399, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 7850 + }, + { + "epoch": 5.764576457645765, + "grad_norm": 0.9269950985908508, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 7860 + }, + { + "epoch": 5.771910524385772, + "grad_norm": 1.0099073648452759, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 7870 + }, + { + "epoch": 5.77924459112578, + "grad_norm": 0.9123615026473999, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 7880 + }, + { + "epoch": 5.786578657865786, + "grad_norm": 1.1542246341705322, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 7890 + }, + { + "epoch": 5.793912724605794, + "grad_norm": 1.0792022943496704, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 7900 + }, + { + "epoch": 5.801246791345801, + "grad_norm": 0.95615553855896, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 7910 + }, + { + "epoch": 5.808580858085809, + "grad_norm": 1.2471332550048828, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 7920 + }, + { + "epoch": 5.815914924825816, + "grad_norm": 1.0189851522445679, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 7930 + }, + { + "epoch": 5.823248991565823, + "grad_norm": 1.3309742212295532, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 7940 + }, + { + "epoch": 5.83058305830583, + "grad_norm": 1.2930549383163452, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 7950 + }, + { + "epoch": 5.837917125045838, + "grad_norm": 0.8216308951377869, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 7960 + }, + { + "epoch": 5.845251191785845, + "grad_norm": 1.1205775737762451, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 7970 + }, + { + "epoch": 5.852585258525853, + "grad_norm": 0.851298451423645, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 7980 + }, + { + "epoch": 5.85991932526586, + "grad_norm": 0.8797095417976379, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 7990 + }, + { + "epoch": 5.867253392005868, + "grad_norm": 1.5784614086151123, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 8000 + }, + { + "epoch": 5.874587458745875, + "grad_norm": 1.1531187295913696, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 8010 + }, + { + "epoch": 5.881921525485882, + "grad_norm": 1.2469146251678467, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 8020 + }, + { + "epoch": 5.889255592225889, + "grad_norm": 1.0784350633621216, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 8030 + }, + { + "epoch": 5.896589658965897, + "grad_norm": 1.1311599016189575, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 8040 + }, + { + "epoch": 5.903923725705904, + "grad_norm": 0.9654512405395508, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 8050 + }, + { + "epoch": 5.9112577924459115, + "grad_norm": 1.3288270235061646, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 8060 + }, + { + "epoch": 5.918591859185918, + "grad_norm": 1.12800931930542, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 8070 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 0.9449917674064636, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 8080 + }, + { + "epoch": 5.933259992665933, + "grad_norm": 1.1532357931137085, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 8090 + }, + { + "epoch": 5.9405940594059405, + "grad_norm": 1.2211151123046875, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 8100 + }, + { + "epoch": 5.947928126145948, + "grad_norm": 1.3459105491638184, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 8110 + }, + { + "epoch": 5.9552621928859555, + "grad_norm": 1.251999855041504, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 8120 + }, + { + "epoch": 5.962596259625963, + "grad_norm": 1.5682506561279297, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 8130 + }, + { + "epoch": 5.9699303263659695, + "grad_norm": 0.926075279712677, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 8140 + }, + { + "epoch": 5.977264393105977, + "grad_norm": 0.9622511863708496, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 8150 + }, + { + "epoch": 5.9845984598459845, + "grad_norm": 0.9633373618125916, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 8160 + }, + { + "epoch": 5.991932526585992, + "grad_norm": 0.8960476517677307, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 8170 + }, + { + "epoch": 5.999266593325999, + "grad_norm": 0.9372805953025818, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 8180 + }, + { + "epoch": 6.0, + "eval_loss": 1.3233846426010132, + "eval_runtime": 32.7419, + "eval_samples_per_second": 13.164, + "eval_steps_per_second": 1.649, + "step": 8181 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.785983619990815e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-8181/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/README.md b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7cf9e959e32046d22d776998db9f62da2f301a35 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9bf2e18a4874146b707384d0e34b6a5ffdda8564101e38b2f64f81beb762de +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7de281b3259db1c7a5f003c28543f8476618c91d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c69583b3ea1be11ba1035d9f27d924ff5ebff08b05184c101a63cee86e38850 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c0afe783aec92385b20bf53b88de93cf5800656 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdfd0044ec7097c0370428a56f4983cc513ae1f01d46173d06bfb7c1a890bdb6 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c18e8ea45f03d57945d8cedf736c7cb140071a1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14322ec31c8207dffb4e78821ba511f674f7d5cc97a6efc3327c3ce666090d22 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..178d72b936e4bf0bc804a7530ac341e5d22400f0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/trainer_state.json @@ -0,0 +1,6767 @@ +{ + "best_metric": 1.1534006595611572, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", + "epoch": 6.999633296662999, + "eval_steps": 10, + "global_step": 9544, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007334066740007334, + "grad_norm": 0.47521963715553284, + "learning_rate": 0.0002, + "loss": 1.9722, + "step": 10 + }, + { + "epoch": 0.014668133480014669, + "grad_norm": 0.5395162105560303, + "learning_rate": 0.0002, + "loss": 1.4821, + "step": 20 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 0.4305780231952667, + "learning_rate": 0.0002, + "loss": 1.4202, + "step": 30 + }, + { + "epoch": 0.029336266960029337, + "grad_norm": 0.6938246488571167, + "learning_rate": 0.0002, + "loss": 1.4271, + "step": 40 + }, + { + "epoch": 0.03667033370003667, + "grad_norm": 1.5133819580078125, + "learning_rate": 0.0002, + "loss": 1.3112, + "step": 50 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 0.9173883199691772, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 60 + }, + { + "epoch": 0.05133846718005134, + "grad_norm": 0.4619861841201782, + "learning_rate": 0.0002, + "loss": 1.2844, + "step": 70 + }, + { + "epoch": 0.058672533920058674, + "grad_norm": 0.46118637919425964, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 80 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 0.4468648135662079, + "learning_rate": 0.0002, + "loss": 1.3441, + "step": 90 + }, + { + "epoch": 0.07334066740007333, + "grad_norm": 0.46123769879341125, + "learning_rate": 0.0002, + "loss": 1.1863, + "step": 100 + }, + { + "epoch": 0.08067473414008068, + "grad_norm": 0.4859139025211334, + "learning_rate": 0.0002, + "loss": 1.2772, + "step": 110 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 0.4384922385215759, + "learning_rate": 0.0002, + "loss": 1.2087, + "step": 120 + }, + { + "epoch": 0.09534286762009535, + "grad_norm": 0.39519360661506653, + "learning_rate": 0.0002, + "loss": 1.2927, + "step": 130 + }, + { + "epoch": 0.10267693436010268, + "grad_norm": 0.4049859344959259, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 140 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 0.4605638086795807, + "learning_rate": 0.0002, + "loss": 1.293, + "step": 150 + }, + { + "epoch": 0.11734506784011735, + "grad_norm": 0.4201928377151489, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 160 + }, + { + "epoch": 0.12467913458012468, + "grad_norm": 0.5367777347564697, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 170 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 0.41752299666404724, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 180 + }, + { + "epoch": 0.13934726806013933, + "grad_norm": 0.31597763299942017, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 190 + }, + { + "epoch": 0.14668133480014667, + "grad_norm": 0.7468788623809814, + "learning_rate": 0.0002, + "loss": 1.2441, + "step": 200 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 0.3403034508228302, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 210 + }, + { + "epoch": 0.16134946828016136, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0002, + "loss": 1.2439, + "step": 220 + }, + { + "epoch": 0.1686835350201687, + "grad_norm": 0.356158971786499, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 230 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 0.3448857367038727, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 240 + }, + { + "epoch": 0.18335166850018336, + "grad_norm": 0.3475699722766876, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 250 + }, + { + "epoch": 0.1906857352401907, + "grad_norm": 0.2770358622074127, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 260 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.4310270845890045, + "learning_rate": 0.0002, + "loss": 1.2238, + "step": 270 + }, + { + "epoch": 0.20535386872020536, + "grad_norm": 0.335041880607605, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 280 + }, + { + "epoch": 0.2126879354602127, + "grad_norm": 0.3420602083206177, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 290 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 0.325001060962677, + "learning_rate": 0.0002, + "loss": 1.1232, + "step": 300 + }, + { + "epoch": 0.22735606894022736, + "grad_norm": 0.3027827739715576, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 310 + }, + { + "epoch": 0.2346901356802347, + "grad_norm": 0.435550719499588, + "learning_rate": 0.0002, + "loss": 1.1803, + "step": 320 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 0.3884522616863251, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 330 + }, + { + "epoch": 0.24935826916024936, + "grad_norm": 0.7736002206802368, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 340 + }, + { + "epoch": 0.2566923359002567, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0002, + "loss": 1.3606, + "step": 350 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 0.3311890959739685, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 360 + }, + { + "epoch": 0.27136046938027136, + "grad_norm": 0.7473500370979309, + "learning_rate": 0.0002, + "loss": 1.2219, + "step": 370 + }, + { + "epoch": 0.27869453612027867, + "grad_norm": 0.3681875765323639, + "learning_rate": 0.0002, + "loss": 1.2712, + "step": 380 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 0.3764737844467163, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 390 + }, + { + "epoch": 0.29336266960029334, + "grad_norm": 0.4243989586830139, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 400 + }, + { + "epoch": 0.3006967363403007, + "grad_norm": 0.2658531963825226, + "learning_rate": 0.0002, + "loss": 1.199, + "step": 410 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 0.3436793386936188, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.31536486982031536, + "grad_norm": 0.5101129412651062, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 430 + }, + { + "epoch": 0.3226989365603227, + "grad_norm": 0.3319750726222992, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 440 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 0.385148286819458, + "learning_rate": 0.0002, + "loss": 1.1804, + "step": 450 + }, + { + "epoch": 0.3373670700403374, + "grad_norm": 0.3477935791015625, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 460 + }, + { + "epoch": 0.3447011367803447, + "grad_norm": 0.29748716950416565, + "learning_rate": 0.0002, + "loss": 1.1877, + "step": 470 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 0.34083324670791626, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 480 + }, + { + "epoch": 0.35936927026035936, + "grad_norm": 0.36904552578926086, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 490 + }, + { + "epoch": 0.3667033370003667, + "grad_norm": 0.315483033657074, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 500 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.44897955656051636, + "learning_rate": 0.0002, + "loss": 1.1461, + "step": 510 + }, + { + "epoch": 0.3813714704803814, + "grad_norm": 0.3160701394081116, + "learning_rate": 0.0002, + "loss": 1.3035, + "step": 520 + }, + { + "epoch": 0.3887055372203887, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0002, + "loss": 1.3197, + "step": 530 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.5430002808570862, + "learning_rate": 0.0002, + "loss": 1.2983, + "step": 540 + }, + { + "epoch": 0.40337367070040336, + "grad_norm": 0.2908070683479309, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 550 + }, + { + "epoch": 0.4107077374404107, + "grad_norm": 0.35066530108451843, + "learning_rate": 0.0002, + "loss": 1.2384, + "step": 560 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.37588003277778625, + "learning_rate": 0.0002, + "loss": 1.1784, + "step": 570 + }, + { + "epoch": 0.4253758709204254, + "grad_norm": 0.3112126886844635, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 580 + }, + { + "epoch": 0.4327099376604327, + "grad_norm": 0.35577139258384705, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 590 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.31706422567367554, + "learning_rate": 0.0002, + "loss": 1.184, + "step": 600 + }, + { + "epoch": 0.44737807114044736, + "grad_norm": 0.3249092102050781, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 610 + }, + { + "epoch": 0.4547121378804547, + "grad_norm": 0.3842705488204956, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 620 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 0.390991747379303, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 630 + }, + { + "epoch": 0.4693802713604694, + "grad_norm": 0.27532413601875305, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 640 + }, + { + "epoch": 0.4767143381004767, + "grad_norm": 0.31412816047668457, + "learning_rate": 0.0002, + "loss": 1.1058, + "step": 650 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.32117101550102234, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 660 + }, + { + "epoch": 0.49138247158049136, + "grad_norm": 0.3810010254383087, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 670 + }, + { + "epoch": 0.4987165383204987, + "grad_norm": 0.36289164423942566, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 680 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.34458720684051514, + "learning_rate": 0.0002, + "loss": 1.2034, + "step": 690 + }, + { + "epoch": 0.5133846718005134, + "grad_norm": 0.32844600081443787, + "learning_rate": 0.0002, + "loss": 1.1756, + "step": 700 + }, + { + "epoch": 0.5207187385405208, + "grad_norm": 0.3144175708293915, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 710 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.3898887634277344, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 720 + }, + { + "epoch": 0.5353868720205354, + "grad_norm": 1.3220758438110352, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 730 + }, + { + "epoch": 0.5427209387605427, + "grad_norm": 0.3635874390602112, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 740 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.3138217628002167, + "learning_rate": 0.0002, + "loss": 1.2169, + "step": 750 + }, + { + "epoch": 0.5573890722405573, + "grad_norm": 0.4063207805156708, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 760 + }, + { + "epoch": 0.5647231389805647, + "grad_norm": 0.3926219940185547, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 770 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.31954652070999146, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 780 + }, + { + "epoch": 0.5793912724605794, + "grad_norm": 0.4248711168766022, + "learning_rate": 0.0002, + "loss": 1.2977, + "step": 790 + }, + { + "epoch": 0.5867253392005867, + "grad_norm": 0.643004834651947, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 800 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.3479592800140381, + "learning_rate": 0.0002, + "loss": 1.1793, + "step": 810 + }, + { + "epoch": 0.6013934726806014, + "grad_norm": 0.4684754014015198, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 820 + }, + { + "epoch": 0.6087275394206088, + "grad_norm": 0.3739790916442871, + "learning_rate": 0.0002, + "loss": 1.2002, + "step": 830 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.40884748101234436, + "learning_rate": 0.0002, + "loss": 1.2139, + "step": 840 + }, + { + "epoch": 0.6233956729006234, + "grad_norm": 0.9722164273262024, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 850 + }, + { + "epoch": 0.6307297396406307, + "grad_norm": 0.42992347478866577, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 860 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.36654195189476013, + "learning_rate": 0.0002, + "loss": 1.1339, + "step": 870 + }, + { + "epoch": 0.6453978731206454, + "grad_norm": 0.4113832116127014, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 880 + }, + { + "epoch": 0.6527319398606527, + "grad_norm": 0.2948838770389557, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 890 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.38330280780792236, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 900 + }, + { + "epoch": 0.6674000733406674, + "grad_norm": 0.4428867697715759, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 910 + }, + { + "epoch": 0.6747341400806748, + "grad_norm": 0.23659265041351318, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 920 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.323685884475708, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 930 + }, + { + "epoch": 0.6894022735606894, + "grad_norm": 0.39157727360725403, + "learning_rate": 0.0002, + "loss": 1.0853, + "step": 940 + }, + { + "epoch": 0.6967363403006968, + "grad_norm": 0.27189481258392334, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 950 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.529883861541748, + "learning_rate": 0.0002, + "loss": 1.1033, + "step": 960 + }, + { + "epoch": 0.7114044737807114, + "grad_norm": 0.34758689999580383, + "learning_rate": 0.0002, + "loss": 1.139, + "step": 970 + }, + { + "epoch": 0.7187385405207187, + "grad_norm": 0.831749439239502, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 980 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.4438304007053375, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 990 + }, + { + "epoch": 0.7334066740007334, + "grad_norm": 0.33840006589889526, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 1000 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3454797863960266, + "learning_rate": 0.0002, + "loss": 1.254, + "step": 1010 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.38999441266059875, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 1020 + }, + { + "epoch": 0.7554088742207554, + "grad_norm": 0.2829911708831787, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1030 + }, + { + "epoch": 0.7627429409607628, + "grad_norm": 0.36918163299560547, + "learning_rate": 0.0002, + "loss": 1.2123, + "step": 1040 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3415680229663849, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 1050 + }, + { + "epoch": 0.7774110744407774, + "grad_norm": 0.2974182963371277, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 1060 + }, + { + "epoch": 0.7847451411807848, + "grad_norm": 0.3880919814109802, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 1070 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.33503302931785583, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 1080 + }, + { + "epoch": 0.7994132746607994, + "grad_norm": 0.3728407025337219, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 1090 + }, + { + "epoch": 0.8067473414008067, + "grad_norm": 0.3509373664855957, + "learning_rate": 0.0002, + "loss": 1.0835, + "step": 1100 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.42228564620018005, + "learning_rate": 0.0002, + "loss": 1.2661, + "step": 1110 + }, + { + "epoch": 0.8214154748808215, + "grad_norm": 0.313467800617218, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 1120 + }, + { + "epoch": 0.8287495416208287, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0002, + "loss": 1.1971, + "step": 1130 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.43200382590293884, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 1140 + }, + { + "epoch": 0.8434176751008434, + "grad_norm": 0.3309599459171295, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 1150 + }, + { + "epoch": 0.8507517418408508, + "grad_norm": 0.3526846170425415, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1160 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 1.2722247838974, + "learning_rate": 0.0002, + "loss": 1.0851, + "step": 1170 + }, + { + "epoch": 0.8654198753208654, + "grad_norm": 0.34142059087753296, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 1180 + }, + { + "epoch": 0.8727539420608728, + "grad_norm": 0.3805823028087616, + "learning_rate": 0.0002, + "loss": 1.2187, + "step": 1190 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.3931232690811157, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 1200 + }, + { + "epoch": 0.8874220755408874, + "grad_norm": 0.2937372624874115, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 1210 + }, + { + "epoch": 0.8947561422808947, + "grad_norm": 0.3757196366786957, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 1220 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.3502705991268158, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 1230 + }, + { + "epoch": 0.9094242757609095, + "grad_norm": 0.32758915424346924, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 1240 + }, + { + "epoch": 0.9167583425009168, + "grad_norm": 0.37199416756629944, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 1250 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.3551490604877472, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 1260 + }, + { + "epoch": 0.9314264759809314, + "grad_norm": 0.2859550714492798, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 1270 + }, + { + "epoch": 0.9387605427209388, + "grad_norm": 0.427990585565567, + "learning_rate": 0.0002, + "loss": 1.2186, + "step": 1280 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.33717992901802063, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 1290 + }, + { + "epoch": 0.9534286762009534, + "grad_norm": 0.30225634574890137, + "learning_rate": 0.0002, + "loss": 1.1656, + "step": 1300 + }, + { + "epoch": 0.9607627429409608, + "grad_norm": 0.385821133852005, + "learning_rate": 0.0002, + "loss": 1.2404, + "step": 1310 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.0002, + "loss": 1.1932, + "step": 1320 + }, + { + "epoch": 0.9754308764209755, + "grad_norm": 0.49987098574638367, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 1330 + }, + { + "epoch": 0.9827649431609827, + "grad_norm": 0.3842747211456299, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 1340 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.6274653673171997, + "learning_rate": 0.0002, + "loss": 1.0862, + "step": 1350 + }, + { + "epoch": 0.9974330766409975, + "grad_norm": 0.5239808559417725, + "learning_rate": 0.0002, + "loss": 1.124, + "step": 1360 + }, + { + "epoch": 0.9996332966629996, + "eval_loss": 1.1822267770767212, + "eval_runtime": 32.7389, + "eval_samples_per_second": 13.165, + "eval_steps_per_second": 1.649, + "step": 1363 + }, + { + "epoch": 1.0047671433810048, + "grad_norm": 0.45311301946640015, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 1370 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.29685574769973755, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1380 + }, + { + "epoch": 1.0194352768610195, + "grad_norm": 0.3290937840938568, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 1390 + }, + { + "epoch": 1.0267693436010268, + "grad_norm": 0.3801758587360382, + "learning_rate": 0.0002, + "loss": 1.0295, + "step": 1400 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.794174313545227, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 1410 + }, + { + "epoch": 1.0414374770810415, + "grad_norm": 0.3854154646396637, + "learning_rate": 0.0002, + "loss": 1.2232, + "step": 1420 + }, + { + "epoch": 1.0487715438210488, + "grad_norm": 0.32702451944351196, + "learning_rate": 0.0002, + "loss": 1.0652, + "step": 1430 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.7815203666687012, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 1440 + }, + { + "epoch": 1.0634396773010635, + "grad_norm": 0.3087436854839325, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 1450 + }, + { + "epoch": 1.0707737440410707, + "grad_norm": 0.3847602903842926, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 1460 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.3693031370639801, + "learning_rate": 0.0002, + "loss": 1.1428, + "step": 1470 + }, + { + "epoch": 1.0854418775210855, + "grad_norm": 0.4111202359199524, + "learning_rate": 0.0002, + "loss": 1.0995, + "step": 1480 + }, + { + "epoch": 1.0927759442610927, + "grad_norm": 0.41452381014823914, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 1490 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.3336445093154907, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 1500 + }, + { + "epoch": 1.1074440777411074, + "grad_norm": 0.3923407793045044, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 1510 + }, + { + "epoch": 1.1147781444811147, + "grad_norm": 0.46215683221817017, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 1520 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.3592156767845154, + "learning_rate": 0.0002, + "loss": 1.1133, + "step": 1530 + }, + { + "epoch": 1.1294462779611294, + "grad_norm": 0.361110657453537, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 1540 + }, + { + "epoch": 1.1367803447011369, + "grad_norm": 0.5317131280899048, + "learning_rate": 0.0002, + "loss": 1.1553, + "step": 1550 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.3882388174533844, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 1560 + }, + { + "epoch": 1.1514484781811514, + "grad_norm": 0.3259428143501282, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 1570 + }, + { + "epoch": 1.1587825449211588, + "grad_norm": 0.410935640335083, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 1580 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.44940185546875, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 1590 + }, + { + "epoch": 1.1734506784011733, + "grad_norm": 0.5106484293937683, + "learning_rate": 0.0002, + "loss": 1.0334, + "step": 1600 + }, + { + "epoch": 1.1807847451411808, + "grad_norm": 0.6603665947914124, + "learning_rate": 0.0002, + "loss": 1.2376, + "step": 1610 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.4799964129924774, + "learning_rate": 0.0002, + "loss": 1.1227, + "step": 1620 + }, + { + "epoch": 1.1954528786211955, + "grad_norm": 0.4389883279800415, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 1630 + }, + { + "epoch": 1.2027869453612028, + "grad_norm": 0.4188813269138336, + "learning_rate": 0.0002, + "loss": 1.0667, + "step": 1640 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.7132157683372498, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 1650 + }, + { + "epoch": 1.2174550788412175, + "grad_norm": 0.507480263710022, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 1660 + }, + { + "epoch": 1.2247891455812248, + "grad_norm": 0.9452332854270935, + "learning_rate": 0.0002, + "loss": 0.9948, + "step": 1670 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 0.4121614992618561, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 1680 + }, + { + "epoch": 1.2394572790612395, + "grad_norm": 0.34230247139930725, + "learning_rate": 0.0002, + "loss": 1.0366, + "step": 1690 + }, + { + "epoch": 1.2467913458012467, + "grad_norm": 0.4026208817958832, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 1700 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.46673697233200073, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 1710 + }, + { + "epoch": 1.2614594792812615, + "grad_norm": 0.38349825143814087, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 1720 + }, + { + "epoch": 1.2687935460212687, + "grad_norm": 0.4049997627735138, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 1730 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.3417615294456482, + "learning_rate": 0.0002, + "loss": 0.9504, + "step": 1740 + }, + { + "epoch": 1.2834616795012834, + "grad_norm": 0.4277614951133728, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 1750 + }, + { + "epoch": 1.2907957462412907, + "grad_norm": 0.5864202976226807, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 1760 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.7097493410110474, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 1770 + }, + { + "epoch": 1.3054638797213054, + "grad_norm": 0.3145381212234497, + "learning_rate": 0.0002, + "loss": 1.1132, + "step": 1780 + }, + { + "epoch": 1.3127979464613129, + "grad_norm": 0.5116165280342102, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 1790 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.7469736337661743, + "learning_rate": 0.0002, + "loss": 1.0765, + "step": 1800 + }, + { + "epoch": 1.3274660799413276, + "grad_norm": 0.32272255420684814, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 1810 + }, + { + "epoch": 1.3348001466813348, + "grad_norm": 0.3534623086452484, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 1820 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.36127907037734985, + "learning_rate": 0.0002, + "loss": 1.1628, + "step": 1830 + }, + { + "epoch": 1.3494682801613496, + "grad_norm": 0.4072401523590088, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 1840 + }, + { + "epoch": 1.3568023469013568, + "grad_norm": 0.3769161105155945, + "learning_rate": 0.0002, + "loss": 1.1267, + "step": 1850 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 0.412883460521698, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 1860 + }, + { + "epoch": 1.3714704803813715, + "grad_norm": 0.3735875189304352, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 1870 + }, + { + "epoch": 1.3788045471213788, + "grad_norm": 0.39158159494400024, + "learning_rate": 0.0002, + "loss": 1.1061, + "step": 1880 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.44431769847869873, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 1890 + }, + { + "epoch": 1.3934726806013935, + "grad_norm": 0.37772801518440247, + "learning_rate": 0.0002, + "loss": 1.0216, + "step": 1900 + }, + { + "epoch": 1.4008067473414008, + "grad_norm": 0.4056641757488251, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 1910 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.41612377762794495, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 1920 + }, + { + "epoch": 1.4154748808214155, + "grad_norm": 0.41153013706207275, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 1930 + }, + { + "epoch": 1.4228089475614227, + "grad_norm": 0.387845516204834, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 1940 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.3809587061405182, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 1950 + }, + { + "epoch": 1.4374770810414375, + "grad_norm": 0.3625726103782654, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1960 + }, + { + "epoch": 1.444811147781445, + "grad_norm": 0.5294290781021118, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1970 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.39975494146347046, + "learning_rate": 0.0002, + "loss": 1.1114, + "step": 1980 + }, + { + "epoch": 1.4594792812614594, + "grad_norm": 0.4181167185306549, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 1990 + }, + { + "epoch": 1.466813348001467, + "grad_norm": 0.42001503705978394, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 2000 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4877578616142273, + "learning_rate": 0.0002, + "loss": 1.1266, + "step": 2010 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.4050969183444977, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 2020 + }, + { + "epoch": 1.4888155482214889, + "grad_norm": 0.39068883657455444, + "learning_rate": 0.0002, + "loss": 1.0562, + "step": 2030 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.421282559633255, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 2040 + }, + { + "epoch": 1.5034836817015034, + "grad_norm": 0.47092297673225403, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 2050 + }, + { + "epoch": 1.5108177484415108, + "grad_norm": 0.39688974618911743, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 2060 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5529879331588745, + "learning_rate": 0.0002, + "loss": 1.08, + "step": 2070 + }, + { + "epoch": 1.5254858819215253, + "grad_norm": 0.4879782199859619, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 2080 + }, + { + "epoch": 1.5328199486615328, + "grad_norm": 0.5517361164093018, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 2090 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.44015637040138245, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 2100 + }, + { + "epoch": 1.5474880821415475, + "grad_norm": 0.5435167551040649, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 2110 + }, + { + "epoch": 1.5548221488815548, + "grad_norm": 0.5714033246040344, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 2120 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.31732529401779175, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 2130 + }, + { + "epoch": 1.5694902823615695, + "grad_norm": 0.49068278074264526, + "learning_rate": 0.0002, + "loss": 1.0817, + "step": 2140 + }, + { + "epoch": 1.5768243491015768, + "grad_norm": 0.46851542592048645, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 2150 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.5083092451095581, + "learning_rate": 0.0002, + "loss": 1.0623, + "step": 2160 + }, + { + "epoch": 1.5914924825815915, + "grad_norm": 0.9822936058044434, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 2170 + }, + { + "epoch": 1.5988265493215987, + "grad_norm": 0.4575989246368408, + "learning_rate": 0.0002, + "loss": 0.9986, + "step": 2180 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 0.47444286942481995, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 2190 + }, + { + "epoch": 1.6134946828016135, + "grad_norm": 0.7208226919174194, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 2200 + }, + { + "epoch": 1.6208287495416207, + "grad_norm": 0.43791481852531433, + "learning_rate": 0.0002, + "loss": 1.15, + "step": 2210 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.5245792865753174, + "learning_rate": 0.0002, + "loss": 1.0961, + "step": 2220 + }, + { + "epoch": 1.6354968830216357, + "grad_norm": 0.39289429783821106, + "learning_rate": 0.0002, + "loss": 0.9957, + "step": 2230 + }, + { + "epoch": 1.6428309497616427, + "grad_norm": 0.6106135845184326, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 2240 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.3722580671310425, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 2250 + }, + { + "epoch": 1.6574990832416576, + "grad_norm": 0.3649403750896454, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2260 + }, + { + "epoch": 1.6648331499816649, + "grad_norm": 0.46514248847961426, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 2270 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.42034927010536194, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 2280 + }, + { + "epoch": 1.6795012834616796, + "grad_norm": 0.45202910900115967, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 2290 + }, + { + "epoch": 1.6868353502016868, + "grad_norm": 0.36257603764533997, + "learning_rate": 0.0002, + "loss": 1.0866, + "step": 2300 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.6340323090553284, + "learning_rate": 0.0002, + "loss": 1.0973, + "step": 2310 + }, + { + "epoch": 1.7015034836817016, + "grad_norm": 0.4352878928184509, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 2320 + }, + { + "epoch": 1.7088375504217088, + "grad_norm": 0.45029792189598083, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 2330 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.3891315758228302, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 2340 + }, + { + "epoch": 1.7235056839017235, + "grad_norm": 0.35180050134658813, + "learning_rate": 0.0002, + "loss": 0.9779, + "step": 2350 + }, + { + "epoch": 1.7308397506417308, + "grad_norm": 0.42367449402809143, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 2360 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.4553675353527069, + "learning_rate": 0.0002, + "loss": 1.0376, + "step": 2370 + }, + { + "epoch": 1.7455078841217455, + "grad_norm": 0.5944654941558838, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 2380 + }, + { + "epoch": 1.752841950861753, + "grad_norm": 0.3479664623737335, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 2390 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 0.3585502505302429, + "learning_rate": 0.0002, + "loss": 1.0798, + "step": 2400 + }, + { + "epoch": 1.7675100843417675, + "grad_norm": 0.4263346493244171, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 2410 + }, + { + "epoch": 1.774844151081775, + "grad_norm": 0.5476409196853638, + "learning_rate": 0.0002, + "loss": 1.054, + "step": 2420 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.3694186508655548, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 2430 + }, + { + "epoch": 1.7895122845617895, + "grad_norm": 0.9185658693313599, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 2440 + }, + { + "epoch": 1.796846351301797, + "grad_norm": 0.7171908020973206, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 2450 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.550658643245697, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 2460 + }, + { + "epoch": 1.8115144847818114, + "grad_norm": 0.4075568914413452, + "learning_rate": 0.0002, + "loss": 0.9975, + "step": 2470 + }, + { + "epoch": 1.818848551521819, + "grad_norm": 0.3790127635002136, + "learning_rate": 0.0002, + "loss": 1.0935, + "step": 2480 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.3576384484767914, + "learning_rate": 0.0002, + "loss": 0.9839, + "step": 2490 + }, + { + "epoch": 1.8335166850018334, + "grad_norm": 0.3919370770454407, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 2500 + }, + { + "epoch": 1.8408507517418409, + "grad_norm": 0.485083669424057, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 2510 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.4564347565174103, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 2520 + }, + { + "epoch": 1.8555188852218554, + "grad_norm": 0.3613106608390808, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 2530 + }, + { + "epoch": 1.8628529519618628, + "grad_norm": 0.39600759744644165, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 2540 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 1.123499870300293, + "learning_rate": 0.0002, + "loss": 0.9453, + "step": 2550 + }, + { + "epoch": 1.8775210854418776, + "grad_norm": 0.4612680673599243, + "learning_rate": 0.0002, + "loss": 1.0635, + "step": 2560 + }, + { + "epoch": 1.8848551521818848, + "grad_norm": 0.42745399475097656, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 2570 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.4055580198764801, + "learning_rate": 0.0002, + "loss": 1.0102, + "step": 2580 + }, + { + "epoch": 1.8995232856618995, + "grad_norm": 0.44174644351005554, + "learning_rate": 0.0002, + "loss": 1.0177, + "step": 2590 + }, + { + "epoch": 1.9068573524019068, + "grad_norm": 1.0228385925292969, + "learning_rate": 0.0002, + "loss": 0.9886, + "step": 2600 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.3496396243572235, + "learning_rate": 0.0002, + "loss": 1.0857, + "step": 2610 + }, + { + "epoch": 1.9215254858819215, + "grad_norm": 0.4191173017024994, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 2620 + }, + { + "epoch": 1.9288595526219288, + "grad_norm": 0.6778554916381836, + "learning_rate": 0.0002, + "loss": 1.0943, + "step": 2630 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.41992834210395813, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 2640 + }, + { + "epoch": 1.9435276861019435, + "grad_norm": 0.8760401010513306, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 2650 + }, + { + "epoch": 1.9508617528419507, + "grad_norm": 0.44049209356307983, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 2660 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5651928782463074, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 2670 + }, + { + "epoch": 1.9655298863219657, + "grad_norm": 0.5292727947235107, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 2680 + }, + { + "epoch": 1.9728639530619727, + "grad_norm": 0.6012240648269653, + "learning_rate": 0.0002, + "loss": 1.1328, + "step": 2690 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.3945149779319763, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 2700 + }, + { + "epoch": 1.9875320865419877, + "grad_norm": 0.5732627511024475, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 2710 + }, + { + "epoch": 1.994866153281995, + "grad_norm": 0.3963361084461212, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 2720 + }, + { + "epoch": 2.0, + "eval_loss": 1.1534006595611572, + "eval_runtime": 32.7541, + "eval_samples_per_second": 13.159, + "eval_steps_per_second": 1.649, + "step": 2727 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.48628315329551697, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 2730 + }, + { + "epoch": 2.0095342867620096, + "grad_norm": 0.413875013589859, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 2740 + }, + { + "epoch": 2.0168683535020167, + "grad_norm": 0.4988735616207123, + "learning_rate": 0.0002, + "loss": 0.965, + "step": 2750 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.5634812712669373, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 2760 + }, + { + "epoch": 2.0315364869820316, + "grad_norm": 0.48302653431892395, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 2770 + }, + { + "epoch": 2.038870553722039, + "grad_norm": 0.49914175271987915, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 2780 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 1.14039945602417, + "learning_rate": 0.0002, + "loss": 0.904, + "step": 2790 + }, + { + "epoch": 2.0535386872020536, + "grad_norm": 0.6359720826148987, + "learning_rate": 0.0002, + "loss": 0.9588, + "step": 2800 + }, + { + "epoch": 2.060872753942061, + "grad_norm": 0.4589158296585083, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 2810 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.46255481243133545, + "learning_rate": 0.0002, + "loss": 0.9438, + "step": 2820 + }, + { + "epoch": 2.0755408874220755, + "grad_norm": 0.6232137680053711, + "learning_rate": 0.0002, + "loss": 0.9464, + "step": 2830 + }, + { + "epoch": 2.082874954162083, + "grad_norm": 0.41042178869247437, + "learning_rate": 0.0002, + "loss": 0.8978, + "step": 2840 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.5334428548812866, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 2850 + }, + { + "epoch": 2.0975430876420975, + "grad_norm": 0.8270058631896973, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 2860 + }, + { + "epoch": 2.104877154382105, + "grad_norm": 0.6624533534049988, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 2870 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0002, + "loss": 0.9196, + "step": 2880 + }, + { + "epoch": 2.1195452878621195, + "grad_norm": 0.621482789516449, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 2890 + }, + { + "epoch": 2.126879354602127, + "grad_norm": 0.4556255340576172, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2900 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 0.4620579183101654, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 2910 + }, + { + "epoch": 2.1415474880821415, + "grad_norm": 0.9602415561676025, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2920 + }, + { + "epoch": 2.148881554822149, + "grad_norm": 0.587943971157074, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 2930 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 0.5121372938156128, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 2940 + }, + { + "epoch": 2.1635496883021634, + "grad_norm": 0.49424484372138977, + "learning_rate": 0.0002, + "loss": 0.8751, + "step": 2950 + }, + { + "epoch": 2.170883755042171, + "grad_norm": 0.6312560439109802, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2960 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.5235576629638672, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2970 + }, + { + "epoch": 2.1855518885221854, + "grad_norm": 0.5868439674377441, + "learning_rate": 0.0002, + "loss": 0.9706, + "step": 2980 + }, + { + "epoch": 2.192885955262193, + "grad_norm": 0.42302873730659485, + "learning_rate": 0.0002, + "loss": 0.9338, + "step": 2990 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.5097725987434387, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 3000 + }, + { + "epoch": 2.2075540887422074, + "grad_norm": 0.5091572403907776, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 3010 + }, + { + "epoch": 2.214888155482215, + "grad_norm": 0.49433162808418274, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 3020 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5577368140220642, + "learning_rate": 0.0002, + "loss": 0.9734, + "step": 3030 + }, + { + "epoch": 2.2295562889622293, + "grad_norm": 0.6177583932876587, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 3040 + }, + { + "epoch": 2.236890355702237, + "grad_norm": 0.5256719589233398, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 3050 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.5001118183135986, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3060 + }, + { + "epoch": 2.2515584891822513, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3070 + }, + { + "epoch": 2.258892555922259, + "grad_norm": 0.5325384140014648, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3080 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.5719189047813416, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 3090 + }, + { + "epoch": 2.2735606894022737, + "grad_norm": 0.6337835788726807, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 3100 + }, + { + "epoch": 2.2808947561422808, + "grad_norm": 0.5381836891174316, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 3110 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.5408531427383423, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 3120 + }, + { + "epoch": 2.2955628896222957, + "grad_norm": 0.43705281615257263, + "learning_rate": 0.0002, + "loss": 1.0325, + "step": 3130 + }, + { + "epoch": 2.3028969563623027, + "grad_norm": 0.6454030275344849, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 3140 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.686030387878418, + "learning_rate": 0.0002, + "loss": 0.954, + "step": 3150 + }, + { + "epoch": 2.3175650898423177, + "grad_norm": 0.5123633146286011, + "learning_rate": 0.0002, + "loss": 0.9403, + "step": 3160 + }, + { + "epoch": 2.3248991565823247, + "grad_norm": 0.842506468296051, + "learning_rate": 0.0002, + "loss": 0.8834, + "step": 3170 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.5193818807601929, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 3180 + }, + { + "epoch": 2.3395672900623397, + "grad_norm": 0.5634409189224243, + "learning_rate": 0.0002, + "loss": 0.9473, + "step": 3190 + }, + { + "epoch": 2.3469013568023467, + "grad_norm": 0.6475534439086914, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 3200 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.1503914594650269, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 2.3615694902823616, + "grad_norm": 0.7234905362129211, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 3220 + }, + { + "epoch": 2.368903557022369, + "grad_norm": 0.664903461933136, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 3230 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.5453006625175476, + "learning_rate": 0.0002, + "loss": 0.9987, + "step": 3240 + }, + { + "epoch": 2.3835716905023836, + "grad_norm": 0.6256654262542725, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 3250 + }, + { + "epoch": 2.390905757242391, + "grad_norm": 0.5166565179824829, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 3260 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.5699098110198975, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 3270 + }, + { + "epoch": 2.4055738907224056, + "grad_norm": 0.4472540020942688, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 3280 + }, + { + "epoch": 2.412907957462413, + "grad_norm": 0.6790403127670288, + "learning_rate": 0.0002, + "loss": 0.9439, + "step": 3290 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 0.5182185173034668, + "learning_rate": 0.0002, + "loss": 0.972, + "step": 3300 + }, + { + "epoch": 2.4275760909424275, + "grad_norm": 0.564647912979126, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 3310 + }, + { + "epoch": 2.434910157682435, + "grad_norm": 0.5625313520431519, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 3320 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.7496559619903564, + "learning_rate": 0.0002, + "loss": 0.8798, + "step": 3330 + }, + { + "epoch": 2.4495782911624495, + "grad_norm": 0.4779128134250641, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 2.456912357902457, + "grad_norm": 0.578093409538269, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3350 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.5456080436706543, + "learning_rate": 0.0002, + "loss": 0.9282, + "step": 3360 + }, + { + "epoch": 2.4715804913824715, + "grad_norm": 0.4769273102283478, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 3370 + }, + { + "epoch": 2.478914558122479, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0002, + "loss": 0.9312, + "step": 3380 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.5590165853500366, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 3390 + }, + { + "epoch": 2.4935826916024935, + "grad_norm": 0.801306962966919, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 3400 + }, + { + "epoch": 2.500916758342501, + "grad_norm": 0.6045624613761902, + "learning_rate": 0.0002, + "loss": 0.9049, + "step": 3410 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.5735858082771301, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 3420 + }, + { + "epoch": 2.5155848918225154, + "grad_norm": 0.6827309131622314, + "learning_rate": 0.0002, + "loss": 0.9846, + "step": 3430 + }, + { + "epoch": 2.522918958562523, + "grad_norm": 0.5702602863311768, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 3440 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.6674721240997314, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 3450 + }, + { + "epoch": 2.5375870920425374, + "grad_norm": 0.5635907649993896, + "learning_rate": 0.0002, + "loss": 0.914, + "step": 3460 + }, + { + "epoch": 2.544921158782545, + "grad_norm": 0.42737770080566406, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 3470 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.6720691919326782, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 2.55958929226256, + "grad_norm": 0.8917084336280823, + "learning_rate": 0.0002, + "loss": 0.8637, + "step": 3490 + }, + { + "epoch": 2.566923359002567, + "grad_norm": 0.5134549140930176, + "learning_rate": 0.0002, + "loss": 0.9257, + "step": 3500 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.4951367974281311, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 3510 + }, + { + "epoch": 2.5815914924825814, + "grad_norm": 0.9438204765319824, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 3520 + }, + { + "epoch": 2.588925559222589, + "grad_norm": 0.6024714708328247, + "learning_rate": 0.0002, + "loss": 0.8939, + "step": 3530 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.5248535871505737, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 3540 + }, + { + "epoch": 2.6035936927026038, + "grad_norm": 0.8677568435668945, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 3550 + }, + { + "epoch": 2.610927759442611, + "grad_norm": 0.82008296251297, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 3560 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.4724634885787964, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 3570 + }, + { + "epoch": 2.6255958929226257, + "grad_norm": 0.5434244275093079, + "learning_rate": 0.0002, + "loss": 0.9058, + "step": 3580 + }, + { + "epoch": 2.6329299596626328, + "grad_norm": 0.4948740005493164, + "learning_rate": 0.0002, + "loss": 0.9379, + "step": 3590 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.42109328508377075, + "learning_rate": 0.0002, + "loss": 0.8718, + "step": 3600 + }, + { + "epoch": 2.6475980931426477, + "grad_norm": 0.7979786396026611, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 3610 + }, + { + "epoch": 2.654932159882655, + "grad_norm": 0.6345919370651245, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 3620 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.4971671402454376, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 3630 + }, + { + "epoch": 2.6696002933626697, + "grad_norm": 0.6467748284339905, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 3640 + }, + { + "epoch": 2.6769343601026767, + "grad_norm": 0.4240160286426544, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 3650 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.5179754495620728, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3660 + }, + { + "epoch": 2.6916024935826917, + "grad_norm": 0.754012405872345, + "learning_rate": 0.0002, + "loss": 0.9221, + "step": 3670 + }, + { + "epoch": 2.698936560322699, + "grad_norm": 0.5141299962997437, + "learning_rate": 0.0002, + "loss": 0.9194, + "step": 3680 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.5737819075584412, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 3690 + }, + { + "epoch": 2.7136046938027136, + "grad_norm": 0.5887577533721924, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 3700 + }, + { + "epoch": 2.720938760542721, + "grad_norm": 0.6740471720695496, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 3710 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.5879453420639038, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 3720 + }, + { + "epoch": 2.7356068940227356, + "grad_norm": 0.4858354926109314, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 3730 + }, + { + "epoch": 2.742940960762743, + "grad_norm": 0.5489001870155334, + "learning_rate": 0.0002, + "loss": 0.9308, + "step": 3740 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.8187092542648315, + "learning_rate": 0.0002, + "loss": 0.894, + "step": 3750 + }, + { + "epoch": 2.7576090942427576, + "grad_norm": 0.5666626691818237, + "learning_rate": 0.0002, + "loss": 0.8954, + "step": 3760 + }, + { + "epoch": 2.764943160982765, + "grad_norm": 0.5377066135406494, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 3770 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.566330075263977, + "learning_rate": 0.0002, + "loss": 0.9132, + "step": 3780 + }, + { + "epoch": 2.7796112944627795, + "grad_norm": 0.5522832870483398, + "learning_rate": 0.0002, + "loss": 0.9415, + "step": 3790 + }, + { + "epoch": 2.786945361202787, + "grad_norm": 0.5668695569038391, + "learning_rate": 0.0002, + "loss": 0.8816, + "step": 3800 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.7566602826118469, + "learning_rate": 0.0002, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 2.8016134946828015, + "grad_norm": 0.5603684782981873, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 3820 + }, + { + "epoch": 2.808947561422809, + "grad_norm": 0.49122217297554016, + "learning_rate": 0.0002, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.6798251867294312, + "learning_rate": 0.0002, + "loss": 0.9738, + "step": 3840 + }, + { + "epoch": 2.8236156949028235, + "grad_norm": 0.6097991466522217, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 3850 + }, + { + "epoch": 2.830949761642831, + "grad_norm": 0.6675726175308228, + "learning_rate": 0.0002, + "loss": 0.8672, + "step": 3860 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.9223952889442444, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 3870 + }, + { + "epoch": 2.8456178951228455, + "grad_norm": 0.6020799875259399, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 3880 + }, + { + "epoch": 2.852951961862853, + "grad_norm": 0.5206381678581238, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3890 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.6268777251243591, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 3900 + }, + { + "epoch": 2.8676200953428674, + "grad_norm": 1.1583497524261475, + "learning_rate": 0.0002, + "loss": 0.9409, + "step": 3910 + }, + { + "epoch": 2.874954162082875, + "grad_norm": 0.7263903021812439, + "learning_rate": 0.0002, + "loss": 0.895, + "step": 3920 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.5369910001754761, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 3930 + }, + { + "epoch": 2.88962229556289, + "grad_norm": 0.7298350930213928, + "learning_rate": 0.0002, + "loss": 1.0015, + "step": 3940 + }, + { + "epoch": 2.896956362302897, + "grad_norm": 0.577012836933136, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 3950 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.5859594345092773, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 3960 + }, + { + "epoch": 2.9116244957829114, + "grad_norm": 0.47176122665405273, + "learning_rate": 0.0002, + "loss": 0.8772, + "step": 3970 + }, + { + "epoch": 2.918958562522919, + "grad_norm": 0.9699620604515076, + "learning_rate": 0.0002, + "loss": 0.8997, + "step": 3980 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.7908747792243958, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3990 + }, + { + "epoch": 2.933626696002934, + "grad_norm": 0.5777379274368286, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 4000 + }, + { + "epoch": 2.940960762742941, + "grad_norm": 0.599288284778595, + "learning_rate": 0.0002, + "loss": 0.9358, + "step": 4010 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.5232274532318115, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 4020 + }, + { + "epoch": 2.9556288962229558, + "grad_norm": 0.6395137310028076, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 4030 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.589260458946228, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 4040 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5699581503868103, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 4050 + }, + { + "epoch": 2.9776310964429777, + "grad_norm": 0.528468132019043, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 4060 + }, + { + "epoch": 2.984965163182985, + "grad_norm": 0.4804670512676239, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4070 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 1.1918889284133911, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 4080 + }, + { + "epoch": 2.9996332966629997, + "grad_norm": 0.5479103326797485, + "learning_rate": 0.0002, + "loss": 0.9178, + "step": 4090 + }, + { + "epoch": 2.9996332966629997, + "eval_loss": 1.1642853021621704, + "eval_runtime": 32.7511, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.649, + "step": 4090 + }, + { + "epoch": 3.006967363403007, + "grad_norm": 0.7430027723312378, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 4100 + }, + { + "epoch": 3.014301430143014, + "grad_norm": 0.6293647289276123, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4110 + }, + { + "epoch": 3.0216354968830217, + "grad_norm": 0.6191329956054688, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 4120 + }, + { + "epoch": 3.028969563623029, + "grad_norm": 0.7959313988685608, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4130 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 0.5956351161003113, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 4140 + }, + { + "epoch": 3.0436376971030437, + "grad_norm": 0.670383632183075, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 4150 + }, + { + "epoch": 3.050971763843051, + "grad_norm": 0.6414518356323242, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 4160 + }, + { + "epoch": 3.058305830583058, + "grad_norm": 0.7928852438926697, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 4170 + }, + { + "epoch": 3.0656398973230656, + "grad_norm": 0.6211121082305908, + "learning_rate": 0.0002, + "loss": 0.7914, + "step": 4180 + }, + { + "epoch": 3.072973964063073, + "grad_norm": 0.6237057447433472, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 4190 + }, + { + "epoch": 3.08030803080308, + "grad_norm": 0.6522233486175537, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 4200 + }, + { + "epoch": 3.0876420975430876, + "grad_norm": 0.9396848678588867, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4210 + }, + { + "epoch": 3.094976164283095, + "grad_norm": 0.8003010749816895, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 4220 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.6733810305595398, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 4230 + }, + { + "epoch": 3.1096442977631096, + "grad_norm": 0.6365828514099121, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 4240 + }, + { + "epoch": 3.116978364503117, + "grad_norm": 1.0805548429489136, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4250 + }, + { + "epoch": 3.1243124312431245, + "grad_norm": 0.7262141108512878, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4260 + }, + { + "epoch": 3.1316464979831315, + "grad_norm": 0.5500539541244507, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 4270 + }, + { + "epoch": 3.138980564723139, + "grad_norm": 0.793912947177887, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 4280 + }, + { + "epoch": 3.1463146314631465, + "grad_norm": 1.2540518045425415, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 4290 + }, + { + "epoch": 3.1536486982031535, + "grad_norm": 0.7020077705383301, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 4300 + }, + { + "epoch": 3.160982764943161, + "grad_norm": 0.5111123323440552, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 4310 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.7172090411186218, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 4320 + }, + { + "epoch": 3.1756508984231755, + "grad_norm": 0.6343168616294861, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 4330 + }, + { + "epoch": 3.182984965163183, + "grad_norm": 0.9563672542572021, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4340 + }, + { + "epoch": 3.1903190319031904, + "grad_norm": 1.0225574970245361, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4350 + }, + { + "epoch": 3.1976530986431975, + "grad_norm": 1.1633386611938477, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 4360 + }, + { + "epoch": 3.204987165383205, + "grad_norm": 0.8915148973464966, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 4370 + }, + { + "epoch": 3.2123212321232124, + "grad_norm": 0.9156812429428101, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4380 + }, + { + "epoch": 3.21965529886322, + "grad_norm": 0.6363258957862854, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 4390 + }, + { + "epoch": 3.226989365603227, + "grad_norm": 0.579099178314209, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 4400 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 0.8778146505355835, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 4410 + }, + { + "epoch": 3.241657499083242, + "grad_norm": 0.8356770873069763, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 4420 + }, + { + "epoch": 3.248991565823249, + "grad_norm": 0.702032208442688, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 4430 + }, + { + "epoch": 3.2563256325632564, + "grad_norm": 0.6386539340019226, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 4440 + }, + { + "epoch": 3.263659699303264, + "grad_norm": 0.7008408904075623, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 4450 + }, + { + "epoch": 3.270993766043271, + "grad_norm": 0.9556332230567932, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 4460 + }, + { + "epoch": 3.2783278327832783, + "grad_norm": 0.5667835474014282, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 4470 + }, + { + "epoch": 3.285661899523286, + "grad_norm": 0.8239172697067261, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 4480 + }, + { + "epoch": 3.292995966263293, + "grad_norm": 0.7045050859451294, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 4490 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.7131434082984924, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 4500 + }, + { + "epoch": 3.3076640997433078, + "grad_norm": 0.6924910545349121, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 4510 + }, + { + "epoch": 3.3149981664833152, + "grad_norm": 0.8945356607437134, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 4520 + }, + { + "epoch": 3.3223322332233223, + "grad_norm": 0.6546903252601624, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 4530 + }, + { + "epoch": 3.3296662999633297, + "grad_norm": 0.8206679224967957, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4540 + }, + { + "epoch": 3.3370003667033368, + "grad_norm": 0.6482203602790833, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 4550 + }, + { + "epoch": 3.3443344334433442, + "grad_norm": 0.7558760046958923, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 4560 + }, + { + "epoch": 3.3516685001833517, + "grad_norm": 0.7794756889343262, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 4570 + }, + { + "epoch": 3.359002566923359, + "grad_norm": 0.7382805943489075, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 4580 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.5912511944770813, + "learning_rate": 0.0002, + "loss": 0.8511, + "step": 4590 + }, + { + "epoch": 3.3736707004033737, + "grad_norm": 0.7444885969161987, + "learning_rate": 0.0002, + "loss": 0.8272, + "step": 4600 + }, + { + "epoch": 3.381004767143381, + "grad_norm": 0.7354922890663147, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 4610 + }, + { + "epoch": 3.388338833883388, + "grad_norm": 0.7685934901237488, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 4620 + }, + { + "epoch": 3.3956729006233957, + "grad_norm": 0.61041259765625, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 4630 + }, + { + "epoch": 3.403006967363403, + "grad_norm": 0.6820451021194458, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 4640 + }, + { + "epoch": 3.41034103410341, + "grad_norm": 0.5819534063339233, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 4650 + }, + { + "epoch": 3.4176751008434176, + "grad_norm": 0.705410897731781, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 4660 + }, + { + "epoch": 3.425009167583425, + "grad_norm": 0.8052892088890076, + "learning_rate": 0.0002, + "loss": 0.7901, + "step": 4670 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.7746483087539673, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 4680 + }, + { + "epoch": 3.4396773010634396, + "grad_norm": 0.7713689804077148, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 4690 + }, + { + "epoch": 3.447011367803447, + "grad_norm": 0.810371994972229, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 4700 + }, + { + "epoch": 3.4543454345434546, + "grad_norm": 0.7702969312667847, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4710 + }, + { + "epoch": 3.4616795012834616, + "grad_norm": 0.7069268822669983, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4720 + }, + { + "epoch": 3.469013568023469, + "grad_norm": 0.7640359401702881, + "learning_rate": 0.0002, + "loss": 0.8199, + "step": 4730 + }, + { + "epoch": 3.4763476347634765, + "grad_norm": 0.8661707639694214, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 4740 + }, + { + "epoch": 3.4836817015034836, + "grad_norm": 0.9970282912254333, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 4750 + }, + { + "epoch": 3.491015768243491, + "grad_norm": 0.5824355483055115, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 4760 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 1.3072649240493774, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 4770 + }, + { + "epoch": 3.5056839017235055, + "grad_norm": 0.873978316783905, + "learning_rate": 0.0002, + "loss": 0.9101, + "step": 4780 + }, + { + "epoch": 3.513017968463513, + "grad_norm": 0.5526657104492188, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4790 + }, + { + "epoch": 3.5203520352035205, + "grad_norm": 0.790894627571106, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 4800 + }, + { + "epoch": 3.5276861019435275, + "grad_norm": 0.8119630217552185, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 4810 + }, + { + "epoch": 3.535020168683535, + "grad_norm": 0.633212149143219, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 4820 + }, + { + "epoch": 3.5423542354235424, + "grad_norm": 0.703029990196228, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 4830 + }, + { + "epoch": 3.54968830216355, + "grad_norm": 0.7603771686553955, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 4840 + }, + { + "epoch": 3.557022368903557, + "grad_norm": 0.6260480880737305, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 4850 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.8203664422035217, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 4860 + }, + { + "epoch": 3.5716905023835714, + "grad_norm": 0.7793813347816467, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 4870 + }, + { + "epoch": 3.579024569123579, + "grad_norm": 0.7667397260665894, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 4880 + }, + { + "epoch": 3.5863586358635864, + "grad_norm": 0.8198829889297485, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 4890 + }, + { + "epoch": 3.593692702603594, + "grad_norm": 0.7689233422279358, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 4900 + }, + { + "epoch": 3.601026769343601, + "grad_norm": 0.7870983481407166, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 4910 + }, + { + "epoch": 3.6083608360836084, + "grad_norm": 0.8133853077888489, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 4920 + }, + { + "epoch": 3.615694902823616, + "grad_norm": 1.308401346206665, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4930 + }, + { + "epoch": 3.623028969563623, + "grad_norm": 0.7131121754646301, + "learning_rate": 0.0002, + "loss": 0.8494, + "step": 4940 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.6825910210609436, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 4950 + }, + { + "epoch": 3.637697103043638, + "grad_norm": 0.7254678606987, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4960 + }, + { + "epoch": 3.6450311697836453, + "grad_norm": 0.8045085072517395, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4970 + }, + { + "epoch": 3.6523652365236523, + "grad_norm": 0.6991777420043945, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 4980 + }, + { + "epoch": 3.6596993032636598, + "grad_norm": 0.7804713249206543, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 4990 + }, + { + "epoch": 3.667033370003667, + "grad_norm": 0.8525708317756653, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 5000 + }, + { + "epoch": 3.6743674367436743, + "grad_norm": 0.7959994673728943, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 5010 + }, + { + "epoch": 3.6817015034836817, + "grad_norm": 0.8103628158569336, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5020 + }, + { + "epoch": 3.689035570223689, + "grad_norm": 0.7517836093902588, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 5030 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.6878514289855957, + "learning_rate": 0.0002, + "loss": 0.8375, + "step": 5040 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 1.2371820211410522, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 5050 + }, + { + "epoch": 3.711037770443711, + "grad_norm": 0.6567103862762451, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 5060 + }, + { + "epoch": 3.718371837183718, + "grad_norm": 1.1254922151565552, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 5070 + }, + { + "epoch": 3.7257059039237257, + "grad_norm": 0.6796132326126099, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 5080 + }, + { + "epoch": 3.733039970663733, + "grad_norm": 0.7285300493240356, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5090 + }, + { + "epoch": 3.7403740374037406, + "grad_norm": 0.8931500911712646, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 5100 + }, + { + "epoch": 3.7477081041437477, + "grad_norm": 0.6256856918334961, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 5110 + }, + { + "epoch": 3.755042170883755, + "grad_norm": 0.79310142993927, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5120 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.6594041585922241, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 5130 + }, + { + "epoch": 3.7697103043637696, + "grad_norm": 0.7029327750205994, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 5140 + }, + { + "epoch": 3.777044371103777, + "grad_norm": 0.5880070328712463, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 5150 + }, + { + "epoch": 3.7843784378437846, + "grad_norm": 0.7578945159912109, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 5160 + }, + { + "epoch": 3.7917125045837916, + "grad_norm": 0.8276378512382507, + "learning_rate": 0.0002, + "loss": 0.8819, + "step": 5170 + }, + { + "epoch": 3.799046571323799, + "grad_norm": 0.7627953886985779, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 5180 + }, + { + "epoch": 3.806380638063806, + "grad_norm": 0.8169086575508118, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 5190 + }, + { + "epoch": 3.8137147048038136, + "grad_norm": 0.6605030298233032, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 5200 + }, + { + "epoch": 3.821048771543821, + "grad_norm": 0.5837286114692688, + "learning_rate": 0.0002, + "loss": 0.8804, + "step": 5210 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 1.2422157526016235, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 5220 + }, + { + "epoch": 3.8357169050238356, + "grad_norm": 0.6589220762252808, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 5230 + }, + { + "epoch": 3.843050971763843, + "grad_norm": 0.8567556142807007, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5240 + }, + { + "epoch": 3.8503850385038505, + "grad_norm": 0.6490627527236938, + "learning_rate": 0.0002, + "loss": 0.8652, + "step": 5250 + }, + { + "epoch": 3.8577191052438575, + "grad_norm": 0.620232880115509, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5260 + }, + { + "epoch": 3.865053171983865, + "grad_norm": 0.7685128450393677, + "learning_rate": 0.0002, + "loss": 0.9192, + "step": 5270 + }, + { + "epoch": 3.8723872387238725, + "grad_norm": 0.8113296627998352, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 5280 + }, + { + "epoch": 3.87972130546388, + "grad_norm": 0.8092675805091858, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 5290 + }, + { + "epoch": 3.887055372203887, + "grad_norm": 0.583570122718811, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 5300 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 1.712363600730896, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 5310 + }, + { + "epoch": 3.9017235056839015, + "grad_norm": 0.6673534512519836, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5320 + }, + { + "epoch": 3.909057572423909, + "grad_norm": 1.9770312309265137, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5330 + }, + { + "epoch": 3.9163916391639164, + "grad_norm": 0.6430999636650085, + "learning_rate": 0.0002, + "loss": 0.8793, + "step": 5340 + }, + { + "epoch": 3.923725705903924, + "grad_norm": 1.0159571170806885, + "learning_rate": 0.0002, + "loss": 0.839, + "step": 5350 + }, + { + "epoch": 3.931059772643931, + "grad_norm": 0.8607584834098816, + "learning_rate": 0.0002, + "loss": 0.9332, + "step": 5360 + }, + { + "epoch": 3.9383938393839384, + "grad_norm": 0.6967900991439819, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 5370 + }, + { + "epoch": 3.945727906123946, + "grad_norm": 0.7683077454566956, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 5380 + }, + { + "epoch": 3.953061972863953, + "grad_norm": 0.6805762648582458, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5390 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.7033619284629822, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5400 + }, + { + "epoch": 3.967730106343968, + "grad_norm": 0.966112494468689, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5410 + }, + { + "epoch": 3.9750641730839753, + "grad_norm": 0.8467881083488464, + "learning_rate": 0.0002, + "loss": 0.8316, + "step": 5420 + }, + { + "epoch": 3.9823982398239823, + "grad_norm": 0.8005317449569702, + "learning_rate": 0.0002, + "loss": 0.8084, + "step": 5430 + }, + { + "epoch": 3.98973230656399, + "grad_norm": 1.1615241765975952, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 5440 + }, + { + "epoch": 3.997066373303997, + "grad_norm": 0.6121614575386047, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 5450 + }, + { + "epoch": 4.0, + "eval_loss": 1.1834222078323364, + "eval_runtime": 32.7569, + "eval_samples_per_second": 13.158, + "eval_steps_per_second": 1.649, + "step": 5454 + }, + { + "epoch": 4.004400440044004, + "grad_norm": 0.6055727005004883, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 5460 + }, + { + "epoch": 4.011734506784012, + "grad_norm": 0.8232647180557251, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 5470 + }, + { + "epoch": 4.019068573524019, + "grad_norm": 0.7739192247390747, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5480 + }, + { + "epoch": 4.026402640264027, + "grad_norm": 0.6264950633049011, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 5490 + }, + { + "epoch": 4.033736707004033, + "grad_norm": 1.4798702001571655, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 5500 + }, + { + "epoch": 4.041070773744041, + "grad_norm": 0.9538470506668091, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 5510 + }, + { + "epoch": 4.048404840484048, + "grad_norm": 0.834561288356781, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 5520 + }, + { + "epoch": 4.055738907224056, + "grad_norm": 0.6407850384712219, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 5530 + }, + { + "epoch": 4.063072973964063, + "grad_norm": 0.9035961627960205, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 5540 + }, + { + "epoch": 4.070407040704071, + "grad_norm": 0.842812716960907, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 5550 + }, + { + "epoch": 4.077741107444078, + "grad_norm": 0.8197882175445557, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 5560 + }, + { + "epoch": 4.085075174184085, + "grad_norm": 0.8652673959732056, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 5570 + }, + { + "epoch": 4.092409240924092, + "grad_norm": 0.8048318028450012, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 5580 + }, + { + "epoch": 4.0997433076641, + "grad_norm": 0.9604969024658203, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 5590 + }, + { + "epoch": 4.107077374404107, + "grad_norm": 1.244756817817688, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 5600 + }, + { + "epoch": 4.114411441144115, + "grad_norm": 0.7975269556045532, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 5610 + }, + { + "epoch": 4.121745507884122, + "grad_norm": 0.6130099296569824, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 5620 + }, + { + "epoch": 4.129079574624129, + "grad_norm": 0.7793202996253967, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 5630 + }, + { + "epoch": 4.136413641364136, + "grad_norm": 1.187238335609436, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 5640 + }, + { + "epoch": 4.143747708104144, + "grad_norm": 0.8450375199317932, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 5650 + }, + { + "epoch": 4.151081774844151, + "grad_norm": 0.9006940126419067, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5660 + }, + { + "epoch": 4.158415841584159, + "grad_norm": 0.9447154998779297, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 5670 + }, + { + "epoch": 4.165749908324166, + "grad_norm": 0.798032283782959, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 5680 + }, + { + "epoch": 4.1730839750641735, + "grad_norm": 0.65578693151474, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 5690 + }, + { + "epoch": 4.18041804180418, + "grad_norm": 1.0864700078964233, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 5700 + }, + { + "epoch": 4.187752108544188, + "grad_norm": 0.7344121932983398, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 5710 + }, + { + "epoch": 4.195086175284195, + "grad_norm": 0.9722456932067871, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 5720 + }, + { + "epoch": 4.2024202420242025, + "grad_norm": 1.263814926147461, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 5730 + }, + { + "epoch": 4.20975430876421, + "grad_norm": 0.9622581005096436, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 5740 + }, + { + "epoch": 4.2170883755042174, + "grad_norm": 0.8497143387794495, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 5750 + }, + { + "epoch": 4.224422442244224, + "grad_norm": 0.8248446583747864, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 5760 + }, + { + "epoch": 4.2317565089842315, + "grad_norm": 1.2544798851013184, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 5770 + }, + { + "epoch": 4.239090575724239, + "grad_norm": 0.8224676251411438, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5780 + }, + { + "epoch": 4.2464246424642464, + "grad_norm": 0.8924877047538757, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 5790 + }, + { + "epoch": 4.253758709204254, + "grad_norm": 0.8545848727226257, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 5800 + }, + { + "epoch": 4.261092775944261, + "grad_norm": 0.8081067800521851, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 5810 + }, + { + "epoch": 4.268426842684269, + "grad_norm": 0.7111002802848816, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 5820 + }, + { + "epoch": 4.2757609094242754, + "grad_norm": 0.8696979880332947, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 5830 + }, + { + "epoch": 4.283094976164283, + "grad_norm": 0.821401834487915, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 5840 + }, + { + "epoch": 4.29042904290429, + "grad_norm": 0.888908326625824, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 5850 + }, + { + "epoch": 4.297763109644298, + "grad_norm": 1.9380123615264893, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 5860 + }, + { + "epoch": 4.305097176384305, + "grad_norm": 1.121774435043335, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 5870 + }, + { + "epoch": 4.312431243124313, + "grad_norm": 0.9238282442092896, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 5880 + }, + { + "epoch": 4.319765309864319, + "grad_norm": 0.7321620583534241, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 5890 + }, + { + "epoch": 4.327099376604327, + "grad_norm": 0.8739548325538635, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 5900 + }, + { + "epoch": 4.334433443344334, + "grad_norm": 0.9686012268066406, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 5910 + }, + { + "epoch": 4.341767510084342, + "grad_norm": 0.9033839106559753, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 5920 + }, + { + "epoch": 4.349101576824349, + "grad_norm": 0.8131115436553955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 5930 + }, + { + "epoch": 4.356435643564357, + "grad_norm": 0.8942412734031677, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 5940 + }, + { + "epoch": 4.363769710304364, + "grad_norm": 0.8439112901687622, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 5950 + }, + { + "epoch": 4.371103777044371, + "grad_norm": 0.9176713228225708, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 5960 + }, + { + "epoch": 4.378437843784378, + "grad_norm": 0.6799634695053101, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 5970 + }, + { + "epoch": 4.385771910524386, + "grad_norm": 1.0435824394226074, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5980 + }, + { + "epoch": 4.393105977264393, + "grad_norm": 0.997937798500061, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 5990 + }, + { + "epoch": 4.400440044004401, + "grad_norm": 1.0308842658996582, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 6000 + }, + { + "epoch": 4.407774110744408, + "grad_norm": 1.3683775663375854, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 6010 + }, + { + "epoch": 4.415108177484415, + "grad_norm": 0.7569534182548523, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 6020 + }, + { + "epoch": 4.422442244224422, + "grad_norm": 1.089978575706482, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 6030 + }, + { + "epoch": 4.42977631096443, + "grad_norm": 0.7522459626197815, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 6040 + }, + { + "epoch": 4.437110377704437, + "grad_norm": 0.6709823608398438, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 6050 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.6992089748382568, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 6060 + }, + { + "epoch": 4.451778511184452, + "grad_norm": 1.0182931423187256, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 6070 + }, + { + "epoch": 4.459112577924459, + "grad_norm": 1.0685160160064697, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 6080 + }, + { + "epoch": 4.466446644664466, + "grad_norm": 0.8295124769210815, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 6090 + }, + { + "epoch": 4.473780711404474, + "grad_norm": 1.1862998008728027, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6100 + }, + { + "epoch": 4.481114778144481, + "grad_norm": 0.7400273084640503, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 6110 + }, + { + "epoch": 4.488448844884489, + "grad_norm": 0.7098417282104492, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 6120 + }, + { + "epoch": 4.495782911624496, + "grad_norm": 0.9745053648948669, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 6130 + }, + { + "epoch": 4.503116978364503, + "grad_norm": 0.8638797998428345, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 6140 + }, + { + "epoch": 4.51045104510451, + "grad_norm": 0.8291046619415283, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6150 + }, + { + "epoch": 4.517785111844518, + "grad_norm": 1.0301737785339355, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 6160 + }, + { + "epoch": 4.525119178584525, + "grad_norm": 1.1996512413024902, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 6170 + }, + { + "epoch": 4.5324532453245325, + "grad_norm": 1.151038408279419, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 6180 + }, + { + "epoch": 4.53978731206454, + "grad_norm": 0.8385201096534729, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 6190 + }, + { + "epoch": 4.5471213788045475, + "grad_norm": 0.8969188332557678, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 6200 + }, + { + "epoch": 4.554455445544555, + "grad_norm": 1.60659658908844, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 6210 + }, + { + "epoch": 4.5617895122845615, + "grad_norm": 0.9356731176376343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 6220 + }, + { + "epoch": 4.569123579024569, + "grad_norm": 0.95856773853302, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 6230 + }, + { + "epoch": 4.5764576457645765, + "grad_norm": 1.1162524223327637, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 6240 + }, + { + "epoch": 4.583791712504584, + "grad_norm": 0.8809238076210022, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 6250 + }, + { + "epoch": 4.591125779244591, + "grad_norm": 0.890738844871521, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 6260 + }, + { + "epoch": 4.598459845984598, + "grad_norm": 0.918684720993042, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 6270 + }, + { + "epoch": 4.6057939127246055, + "grad_norm": 0.8156296610832214, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 6280 + }, + { + "epoch": 4.613127979464613, + "grad_norm": 1.046634316444397, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6290 + }, + { + "epoch": 4.62046204620462, + "grad_norm": 0.7725525498390198, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 6300 + }, + { + "epoch": 4.627796112944628, + "grad_norm": 0.9992046356201172, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 6310 + }, + { + "epoch": 4.635130179684635, + "grad_norm": 0.8480095267295837, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 6320 + }, + { + "epoch": 4.642464246424643, + "grad_norm": 0.7061955332756042, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 6330 + }, + { + "epoch": 4.649798313164649, + "grad_norm": 1.0354212522506714, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 6340 + }, + { + "epoch": 4.657132379904657, + "grad_norm": 1.0081377029418945, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6350 + }, + { + "epoch": 4.664466446644664, + "grad_norm": 1.2904249429702759, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 6360 + }, + { + "epoch": 4.671800513384672, + "grad_norm": 0.9248910546302795, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 6370 + }, + { + "epoch": 4.679134580124679, + "grad_norm": 0.9907804131507874, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 6380 + }, + { + "epoch": 4.686468646864687, + "grad_norm": 1.201143741607666, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 6390 + }, + { + "epoch": 4.693802713604693, + "grad_norm": 0.8709394335746765, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 6400 + }, + { + "epoch": 4.701136780344701, + "grad_norm": 0.7468608021736145, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 6410 + }, + { + "epoch": 4.708470847084708, + "grad_norm": 0.8607903718948364, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 6420 + }, + { + "epoch": 4.715804913824716, + "grad_norm": 0.9840512871742249, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 6430 + }, + { + "epoch": 4.723138980564723, + "grad_norm": 0.8328204154968262, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 6440 + }, + { + "epoch": 4.730473047304731, + "grad_norm": 0.924505352973938, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 6450 + }, + { + "epoch": 4.737807114044738, + "grad_norm": 0.8897685408592224, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 6460 + }, + { + "epoch": 4.745141180784745, + "grad_norm": 0.9605024456977844, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6470 + }, + { + "epoch": 4.752475247524752, + "grad_norm": 0.8150759935379028, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 6480 + }, + { + "epoch": 4.75980931426476, + "grad_norm": 0.8128412961959839, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 6490 + }, + { + "epoch": 4.767143381004767, + "grad_norm": 0.7381404638290405, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 6500 + }, + { + "epoch": 4.774477447744775, + "grad_norm": 1.0565853118896484, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 6510 + }, + { + "epoch": 4.781811514484782, + "grad_norm": 0.9298134446144104, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6520 + }, + { + "epoch": 4.789145581224789, + "grad_norm": 1.0145525932312012, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 6530 + }, + { + "epoch": 4.796479647964796, + "grad_norm": 0.92259681224823, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 6540 + }, + { + "epoch": 4.803813714704804, + "grad_norm": 0.7881024479866028, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 6550 + }, + { + "epoch": 4.811147781444811, + "grad_norm": 1.4935206174850464, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 6560 + }, + { + "epoch": 4.818481848184819, + "grad_norm": 0.8612369298934937, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 6570 + }, + { + "epoch": 4.825815914924826, + "grad_norm": 1.0118653774261475, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 6580 + }, + { + "epoch": 4.833149981664834, + "grad_norm": 1.1303809881210327, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 6590 + }, + { + "epoch": 4.84048404840484, + "grad_norm": 0.9112492203712463, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 6600 + }, + { + "epoch": 4.847818115144848, + "grad_norm": 0.864762544631958, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 6610 + }, + { + "epoch": 4.855152181884855, + "grad_norm": 0.9090572595596313, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 6620 + }, + { + "epoch": 4.862486248624863, + "grad_norm": 1.014953374862671, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 6630 + }, + { + "epoch": 4.86982031536487, + "grad_norm": 1.0702149868011475, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 6640 + }, + { + "epoch": 4.8771543821048775, + "grad_norm": 1.002135157585144, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 6650 + }, + { + "epoch": 4.884488448844884, + "grad_norm": 0.862545907497406, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 6660 + }, + { + "epoch": 4.891822515584892, + "grad_norm": 0.7302131056785583, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 6670 + }, + { + "epoch": 4.899156582324899, + "grad_norm": 0.8380730152130127, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 6680 + }, + { + "epoch": 4.9064906490649065, + "grad_norm": 0.7956018447875977, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 6690 + }, + { + "epoch": 4.913824715804914, + "grad_norm": 0.6717583537101746, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 6700 + }, + { + "epoch": 4.9211587825449215, + "grad_norm": 1.09099280834198, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 6710 + }, + { + "epoch": 4.928492849284929, + "grad_norm": 0.8589889407157898, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 6720 + }, + { + "epoch": 4.9358269160249355, + "grad_norm": 1.0046314001083374, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 6730 + }, + { + "epoch": 4.943160982764943, + "grad_norm": 0.8559659123420715, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 6740 + }, + { + "epoch": 4.9504950495049505, + "grad_norm": 0.8588525652885437, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 6750 + }, + { + "epoch": 4.957829116244958, + "grad_norm": 0.9192708134651184, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 6760 + }, + { + "epoch": 4.965163182984965, + "grad_norm": 1.051398754119873, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 6770 + }, + { + "epoch": 4.972497249724973, + "grad_norm": 0.9111362099647522, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 6780 + }, + { + "epoch": 4.9798313164649795, + "grad_norm": 0.7305638194084167, + "learning_rate": 0.0002, + "loss": 0.7613, + "step": 6790 + }, + { + "epoch": 4.987165383204987, + "grad_norm": 1.118837594985962, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 6800 + }, + { + "epoch": 4.994499449944994, + "grad_norm": 0.9075239300727844, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 6810 + }, + { + "epoch": 4.999633296662999, + "eval_loss": 1.2361247539520264, + "eval_runtime": 32.7325, + "eval_samples_per_second": 13.167, + "eval_steps_per_second": 1.65, + "step": 6817 + }, + { + "epoch": 5.001833516685002, + "grad_norm": 1.0541315078735352, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 6820 + }, + { + "epoch": 5.009167583425009, + "grad_norm": 0.9750140905380249, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 6830 + }, + { + "epoch": 5.016501650165017, + "grad_norm": 0.931838870048523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 6840 + }, + { + "epoch": 5.023835716905023, + "grad_norm": 1.110278844833374, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 6850 + }, + { + "epoch": 5.031169783645031, + "grad_norm": 1.0670180320739746, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 6860 + }, + { + "epoch": 5.038503850385038, + "grad_norm": 0.8762092590332031, + "learning_rate": 0.0002, + "loss": 0.4374, + "step": 6870 + }, + { + "epoch": 5.045837917125046, + "grad_norm": 1.1169432401657104, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 6880 + }, + { + "epoch": 5.053171983865053, + "grad_norm": 1.005491018295288, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 6890 + }, + { + "epoch": 5.060506050605061, + "grad_norm": 1.1751841306686401, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 6900 + }, + { + "epoch": 5.067840117345068, + "grad_norm": 0.8501367568969727, + "learning_rate": 0.0002, + "loss": 0.451, + "step": 6910 + }, + { + "epoch": 5.075174184085075, + "grad_norm": 0.9795131683349609, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 6920 + }, + { + "epoch": 5.082508250825082, + "grad_norm": 0.8929879665374756, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 6930 + }, + { + "epoch": 5.08984231756509, + "grad_norm": 1.0156651735305786, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 6940 + }, + { + "epoch": 5.097176384305097, + "grad_norm": 1.0974335670471191, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 6950 + }, + { + "epoch": 5.104510451045105, + "grad_norm": 1.7015666961669922, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 6960 + }, + { + "epoch": 5.111844517785112, + "grad_norm": 1.0343226194381714, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 6970 + }, + { + "epoch": 5.119178584525119, + "grad_norm": 1.3072983026504517, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 6980 + }, + { + "epoch": 5.126512651265126, + "grad_norm": 1.038986086845398, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6990 + }, + { + "epoch": 5.133846718005134, + "grad_norm": 0.8638386130332947, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 7000 + }, + { + "epoch": 5.141180784745141, + "grad_norm": 0.8326523900032043, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 7010 + }, + { + "epoch": 5.148514851485149, + "grad_norm": 1.0976895093917847, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 7020 + }, + { + "epoch": 5.155848918225156, + "grad_norm": 1.0077873468399048, + "learning_rate": 0.0002, + "loss": 0.4677, + "step": 7030 + }, + { + "epoch": 5.163182984965164, + "grad_norm": 1.0662257671356201, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 7040 + }, + { + "epoch": 5.17051705170517, + "grad_norm": 1.206271767616272, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 7050 + }, + { + "epoch": 5.177851118445178, + "grad_norm": 1.1990262269973755, + "learning_rate": 0.0002, + "loss": 0.4817, + "step": 7060 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 1.0207163095474243, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 7070 + }, + { + "epoch": 5.192519251925193, + "grad_norm": 1.2783987522125244, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 7080 + }, + { + "epoch": 5.1998533186652, + "grad_norm": 1.1592512130737305, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 7090 + }, + { + "epoch": 5.2071873854052075, + "grad_norm": 1.1053160429000854, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 7100 + }, + { + "epoch": 5.214521452145214, + "grad_norm": 1.1925510168075562, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 7110 + }, + { + "epoch": 5.221855518885222, + "grad_norm": 1.0714877843856812, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 7120 + }, + { + "epoch": 5.229189585625229, + "grad_norm": 0.9451011419296265, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 7130 + }, + { + "epoch": 5.2365236523652365, + "grad_norm": 1.03838050365448, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 7140 + }, + { + "epoch": 5.243857719105244, + "grad_norm": 0.9204146265983582, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 7150 + }, + { + "epoch": 5.2511917858452515, + "grad_norm": 1.0142229795455933, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 7160 + }, + { + "epoch": 5.258525852585258, + "grad_norm": 1.4432005882263184, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 7170 + }, + { + "epoch": 5.2658599193252655, + "grad_norm": 1.1239633560180664, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 7180 + }, + { + "epoch": 5.273193986065273, + "grad_norm": 0.7012821435928345, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 7190 + }, + { + "epoch": 5.2805280528052805, + "grad_norm": 1.3499128818511963, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 7200 + }, + { + "epoch": 5.287862119545288, + "grad_norm": 0.9498730897903442, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 7210 + }, + { + "epoch": 5.295196186285295, + "grad_norm": 0.9552369117736816, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 7220 + }, + { + "epoch": 5.302530253025303, + "grad_norm": 0.7610348463058472, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 7230 + }, + { + "epoch": 5.3098643197653095, + "grad_norm": 1.0314512252807617, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 7240 + }, + { + "epoch": 5.317198386505317, + "grad_norm": 1.0534334182739258, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 7250 + }, + { + "epoch": 5.324532453245324, + "grad_norm": 1.2553406953811646, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 7260 + }, + { + "epoch": 5.331866519985332, + "grad_norm": 0.7061691880226135, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 7270 + }, + { + "epoch": 5.339200586725339, + "grad_norm": 0.9652578830718994, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 7280 + }, + { + "epoch": 5.346534653465347, + "grad_norm": 1.114788293838501, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 7290 + }, + { + "epoch": 5.353868720205353, + "grad_norm": 1.0940049886703491, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 7300 + }, + { + "epoch": 5.361202786945361, + "grad_norm": 1.0151008367538452, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 7310 + }, + { + "epoch": 5.368536853685368, + "grad_norm": 1.0369552373886108, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 7320 + }, + { + "epoch": 5.375870920425376, + "grad_norm": 0.8489866256713867, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 7330 + }, + { + "epoch": 5.383204987165383, + "grad_norm": 1.1031713485717773, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 7340 + }, + { + "epoch": 5.390539053905391, + "grad_norm": 0.9094716310501099, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 7350 + }, + { + "epoch": 5.397873120645398, + "grad_norm": 0.9530431032180786, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 7360 + }, + { + "epoch": 5.405207187385405, + "grad_norm": 0.9633604884147644, + "learning_rate": 0.0002, + "loss": 0.529, + "step": 7370 + }, + { + "epoch": 5.412541254125412, + "grad_norm": 0.9541662335395813, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 7380 + }, + { + "epoch": 5.41987532086542, + "grad_norm": 1.0459771156311035, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7390 + }, + { + "epoch": 5.427209387605427, + "grad_norm": 1.027388334274292, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 7400 + }, + { + "epoch": 5.434543454345435, + "grad_norm": 0.7267653346061707, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 7410 + }, + { + "epoch": 5.441877521085442, + "grad_norm": 1.020142674446106, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 7420 + }, + { + "epoch": 5.449211587825449, + "grad_norm": 1.044754147529602, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 7430 + }, + { + "epoch": 5.456545654565456, + "grad_norm": 1.5476195812225342, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 7440 + }, + { + "epoch": 5.463879721305464, + "grad_norm": 0.9879506826400757, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 7450 + }, + { + "epoch": 5.471213788045471, + "grad_norm": 1.2562980651855469, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 7460 + }, + { + "epoch": 5.478547854785479, + "grad_norm": 1.3051384687423706, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 7470 + }, + { + "epoch": 5.485881921525486, + "grad_norm": 1.0511597394943237, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 7480 + }, + { + "epoch": 5.493215988265494, + "grad_norm": 1.0380817651748657, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 7490 + }, + { + "epoch": 5.5005500550055, + "grad_norm": 1.170274257659912, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 7500 + }, + { + "epoch": 5.507884121745508, + "grad_norm": 1.3356517553329468, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 7510 + }, + { + "epoch": 5.515218188485515, + "grad_norm": 1.0727124214172363, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 7520 + }, + { + "epoch": 5.522552255225523, + "grad_norm": 1.0110199451446533, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 7530 + }, + { + "epoch": 5.52988632196553, + "grad_norm": 1.3086743354797363, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 7540 + }, + { + "epoch": 5.537220388705538, + "grad_norm": 1.1904916763305664, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 7550 + }, + { + "epoch": 5.544554455445544, + "grad_norm": 0.9466280937194824, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 7560 + }, + { + "epoch": 5.551888522185552, + "grad_norm": 1.1237901449203491, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 7570 + }, + { + "epoch": 5.559222588925559, + "grad_norm": 0.9590660333633423, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 7580 + }, + { + "epoch": 5.566556655665567, + "grad_norm": 1.0890778303146362, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 7590 + }, + { + "epoch": 5.573890722405574, + "grad_norm": 0.7206931114196777, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 7600 + }, + { + "epoch": 5.5812247891455815, + "grad_norm": 1.2884514331817627, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 7610 + }, + { + "epoch": 5.588558855885589, + "grad_norm": 0.7798039317131042, + "learning_rate": 0.0002, + "loss": 0.5279, + "step": 7620 + }, + { + "epoch": 5.595892922625596, + "grad_norm": 1.166046142578125, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 7630 + }, + { + "epoch": 5.603226989365603, + "grad_norm": 1.0150201320648193, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 7640 + }, + { + "epoch": 5.6105610561056105, + "grad_norm": 1.0449682474136353, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 7650 + }, + { + "epoch": 5.617895122845618, + "grad_norm": 0.9310530424118042, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 7660 + }, + { + "epoch": 5.6252291895856255, + "grad_norm": 0.9117933511734009, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 7670 + }, + { + "epoch": 5.632563256325633, + "grad_norm": 1.1475164890289307, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 7680 + }, + { + "epoch": 5.6398973230656395, + "grad_norm": 1.066809058189392, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 7690 + }, + { + "epoch": 5.647231389805647, + "grad_norm": 1.2834991216659546, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 7700 + }, + { + "epoch": 5.6545654565456545, + "grad_norm": 1.2245112657546997, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 7710 + }, + { + "epoch": 5.661899523285662, + "grad_norm": 1.1424106359481812, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 7720 + }, + { + "epoch": 5.669233590025669, + "grad_norm": 1.0673892498016357, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 7730 + }, + { + "epoch": 5.676567656765677, + "grad_norm": 1.4312121868133545, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 7740 + }, + { + "epoch": 5.683901723505684, + "grad_norm": 0.9976982474327087, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 7750 + }, + { + "epoch": 5.691235790245691, + "grad_norm": 0.9464678168296814, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 7760 + }, + { + "epoch": 5.698569856985698, + "grad_norm": 1.010995626449585, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 7770 + }, + { + "epoch": 5.705903923725706, + "grad_norm": 1.3787750005722046, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 7780 + }, + { + "epoch": 5.713237990465713, + "grad_norm": 1.020922303199768, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 7790 + }, + { + "epoch": 5.720572057205721, + "grad_norm": 0.9748636484146118, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 7800 + }, + { + "epoch": 5.727906123945728, + "grad_norm": 1.3077744245529175, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 7810 + }, + { + "epoch": 5.735240190685735, + "grad_norm": 1.4770057201385498, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 7820 + }, + { + "epoch": 5.742574257425742, + "grad_norm": 1.6349090337753296, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 7830 + }, + { + "epoch": 5.74990832416575, + "grad_norm": 0.9818630814552307, + "learning_rate": 0.0002, + "loss": 0.5056, + "step": 7840 + }, + { + "epoch": 5.757242390905757, + "grad_norm": 0.9659715890884399, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 7850 + }, + { + "epoch": 5.764576457645765, + "grad_norm": 0.9269950985908508, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 7860 + }, + { + "epoch": 5.771910524385772, + "grad_norm": 1.0099073648452759, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 7870 + }, + { + "epoch": 5.77924459112578, + "grad_norm": 0.9123615026473999, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 7880 + }, + { + "epoch": 5.786578657865786, + "grad_norm": 1.1542246341705322, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 7890 + }, + { + "epoch": 5.793912724605794, + "grad_norm": 1.0792022943496704, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 7900 + }, + { + "epoch": 5.801246791345801, + "grad_norm": 0.95615553855896, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 7910 + }, + { + "epoch": 5.808580858085809, + "grad_norm": 1.2471332550048828, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 7920 + }, + { + "epoch": 5.815914924825816, + "grad_norm": 1.0189851522445679, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 7930 + }, + { + "epoch": 5.823248991565823, + "grad_norm": 1.3309742212295532, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 7940 + }, + { + "epoch": 5.83058305830583, + "grad_norm": 1.2930549383163452, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 7950 + }, + { + "epoch": 5.837917125045838, + "grad_norm": 0.8216308951377869, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 7960 + }, + { + "epoch": 5.845251191785845, + "grad_norm": 1.1205775737762451, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 7970 + }, + { + "epoch": 5.852585258525853, + "grad_norm": 0.851298451423645, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 7980 + }, + { + "epoch": 5.85991932526586, + "grad_norm": 0.8797095417976379, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 7990 + }, + { + "epoch": 5.867253392005868, + "grad_norm": 1.5784614086151123, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 8000 + }, + { + "epoch": 5.874587458745875, + "grad_norm": 1.1531187295913696, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 8010 + }, + { + "epoch": 5.881921525485882, + "grad_norm": 1.2469146251678467, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 8020 + }, + { + "epoch": 5.889255592225889, + "grad_norm": 1.0784350633621216, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 8030 + }, + { + "epoch": 5.896589658965897, + "grad_norm": 1.1311599016189575, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 8040 + }, + { + "epoch": 5.903923725705904, + "grad_norm": 0.9654512405395508, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 8050 + }, + { + "epoch": 5.9112577924459115, + "grad_norm": 1.3288270235061646, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 8060 + }, + { + "epoch": 5.918591859185918, + "grad_norm": 1.12800931930542, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 8070 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 0.9449917674064636, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 8080 + }, + { + "epoch": 5.933259992665933, + "grad_norm": 1.1532357931137085, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 8090 + }, + { + "epoch": 5.9405940594059405, + "grad_norm": 1.2211151123046875, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 8100 + }, + { + "epoch": 5.947928126145948, + "grad_norm": 1.3459105491638184, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 8110 + }, + { + "epoch": 5.9552621928859555, + "grad_norm": 1.251999855041504, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 8120 + }, + { + "epoch": 5.962596259625963, + "grad_norm": 1.5682506561279297, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 8130 + }, + { + "epoch": 5.9699303263659695, + "grad_norm": 0.926075279712677, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 8140 + }, + { + "epoch": 5.977264393105977, + "grad_norm": 0.9622511863708496, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 8150 + }, + { + "epoch": 5.9845984598459845, + "grad_norm": 0.9633373618125916, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 8160 + }, + { + "epoch": 5.991932526585992, + "grad_norm": 0.8960476517677307, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 8170 + }, + { + "epoch": 5.999266593325999, + "grad_norm": 0.9372805953025818, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 8180 + }, + { + "epoch": 6.0, + "eval_loss": 1.3233846426010132, + "eval_runtime": 32.7419, + "eval_samples_per_second": 13.164, + "eval_steps_per_second": 1.649, + "step": 8181 + }, + { + "epoch": 6.006600660066007, + "grad_norm": 1.1900787353515625, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 8190 + }, + { + "epoch": 6.013934726806014, + "grad_norm": 1.1448326110839844, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 8200 + }, + { + "epoch": 6.021268793546021, + "grad_norm": 1.1848368644714355, + "learning_rate": 0.0002, + "loss": 0.3667, + "step": 8210 + }, + { + "epoch": 6.028602860286028, + "grad_norm": 1.2315572500228882, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 8220 + }, + { + "epoch": 6.035936927026036, + "grad_norm": 1.2214244604110718, + "learning_rate": 0.0002, + "loss": 0.3541, + "step": 8230 + }, + { + "epoch": 6.043270993766043, + "grad_norm": 0.9455513954162598, + "learning_rate": 0.0002, + "loss": 0.4025, + "step": 8240 + }, + { + "epoch": 6.050605060506051, + "grad_norm": 0.9574248790740967, + "learning_rate": 0.0002, + "loss": 0.4448, + "step": 8250 + }, + { + "epoch": 6.057939127246058, + "grad_norm": 1.1022400856018066, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 8260 + }, + { + "epoch": 6.065273193986065, + "grad_norm": 0.9555122256278992, + "learning_rate": 0.0002, + "loss": 0.3603, + "step": 8270 + }, + { + "epoch": 6.072607260726072, + "grad_norm": 1.1956106424331665, + "learning_rate": 0.0002, + "loss": 0.4324, + "step": 8280 + }, + { + "epoch": 6.07994132746608, + "grad_norm": 1.3110876083374023, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 8290 + }, + { + "epoch": 6.087275394206087, + "grad_norm": 1.1293374300003052, + "learning_rate": 0.0002, + "loss": 0.3664, + "step": 8300 + }, + { + "epoch": 6.094609460946095, + "grad_norm": 0.9176164269447327, + "learning_rate": 0.0002, + "loss": 0.385, + "step": 8310 + }, + { + "epoch": 6.101943527686102, + "grad_norm": 0.9751231670379639, + "learning_rate": 0.0002, + "loss": 0.4142, + "step": 8320 + }, + { + "epoch": 6.109277594426109, + "grad_norm": 1.0536044836044312, + "learning_rate": 0.0002, + "loss": 0.4356, + "step": 8330 + }, + { + "epoch": 6.116611661166116, + "grad_norm": 1.289342999458313, + "learning_rate": 0.0002, + "loss": 0.409, + "step": 8340 + }, + { + "epoch": 6.123945727906124, + "grad_norm": 1.1773661375045776, + "learning_rate": 0.0002, + "loss": 0.4121, + "step": 8350 + }, + { + "epoch": 6.131279794646131, + "grad_norm": 1.2450661659240723, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 8360 + }, + { + "epoch": 6.138613861386139, + "grad_norm": 1.3965914249420166, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 8370 + }, + { + "epoch": 6.145947928126146, + "grad_norm": 1.3530808687210083, + "learning_rate": 0.0002, + "loss": 0.4024, + "step": 8380 + }, + { + "epoch": 6.153281994866154, + "grad_norm": 1.296276330947876, + "learning_rate": 0.0002, + "loss": 0.4658, + "step": 8390 + }, + { + "epoch": 6.16061606160616, + "grad_norm": 0.9759053587913513, + "learning_rate": 0.0002, + "loss": 0.5073, + "step": 8400 + }, + { + "epoch": 6.167950128346168, + "grad_norm": 1.2110707759857178, + "learning_rate": 0.0002, + "loss": 0.4718, + "step": 8410 + }, + { + "epoch": 6.175284195086175, + "grad_norm": 1.312226414680481, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 8420 + }, + { + "epoch": 6.182618261826183, + "grad_norm": 1.1696736812591553, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 8430 + }, + { + "epoch": 6.18995232856619, + "grad_norm": 1.260304570198059, + "learning_rate": 0.0002, + "loss": 0.4546, + "step": 8440 + }, + { + "epoch": 6.197286395306198, + "grad_norm": 1.472961187362671, + "learning_rate": 0.0002, + "loss": 0.4137, + "step": 8450 + }, + { + "epoch": 6.204620462046204, + "grad_norm": 1.3618475198745728, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 8460 + }, + { + "epoch": 6.211954528786212, + "grad_norm": 1.2544318437576294, + "learning_rate": 0.0002, + "loss": 0.415, + "step": 8470 + }, + { + "epoch": 6.219288595526219, + "grad_norm": 1.205898642539978, + "learning_rate": 0.0002, + "loss": 0.3907, + "step": 8480 + }, + { + "epoch": 6.226622662266227, + "grad_norm": 0.9984724521636963, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 8490 + }, + { + "epoch": 6.233956729006234, + "grad_norm": 1.3184109926223755, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 8500 + }, + { + "epoch": 6.241290795746242, + "grad_norm": 1.135520100593567, + "learning_rate": 0.0002, + "loss": 0.3859, + "step": 8510 + }, + { + "epoch": 6.248624862486249, + "grad_norm": 1.4528400897979736, + "learning_rate": 0.0002, + "loss": 0.4159, + "step": 8520 + }, + { + "epoch": 6.255958929226256, + "grad_norm": 1.1222716569900513, + "learning_rate": 0.0002, + "loss": 0.4347, + "step": 8530 + }, + { + "epoch": 6.263292995966263, + "grad_norm": 1.7878046035766602, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 8540 + }, + { + "epoch": 6.270627062706271, + "grad_norm": 0.9789481163024902, + "learning_rate": 0.0002, + "loss": 0.4298, + "step": 8550 + }, + { + "epoch": 6.277961129446278, + "grad_norm": 1.151977300643921, + "learning_rate": 0.0002, + "loss": 0.4316, + "step": 8560 + }, + { + "epoch": 6.2852951961862855, + "grad_norm": 1.389968752861023, + "learning_rate": 0.0002, + "loss": 0.428, + "step": 8570 + }, + { + "epoch": 6.292629262926293, + "grad_norm": 0.884211003780365, + "learning_rate": 0.0002, + "loss": 0.3903, + "step": 8580 + }, + { + "epoch": 6.2999633296663, + "grad_norm": 1.3604296445846558, + "learning_rate": 0.0002, + "loss": 0.4611, + "step": 8590 + }, + { + "epoch": 6.307297396406307, + "grad_norm": 1.1845694780349731, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 8600 + }, + { + "epoch": 6.3146314631463145, + "grad_norm": 1.3231550455093384, + "learning_rate": 0.0002, + "loss": 0.472, + "step": 8610 + }, + { + "epoch": 6.321965529886322, + "grad_norm": 0.9546721577644348, + "learning_rate": 0.0002, + "loss": 0.3922, + "step": 8620 + }, + { + "epoch": 6.3292995966263295, + "grad_norm": 1.2329787015914917, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 8630 + }, + { + "epoch": 6.336633663366337, + "grad_norm": 1.0240199565887451, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 8640 + }, + { + "epoch": 6.343967730106344, + "grad_norm": 1.1866962909698486, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 8650 + }, + { + "epoch": 6.351301796846351, + "grad_norm": 1.2819687128067017, + "learning_rate": 0.0002, + "loss": 0.4575, + "step": 8660 + }, + { + "epoch": 6.3586358635863585, + "grad_norm": 0.9654944539070129, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 8670 + }, + { + "epoch": 6.365969930326366, + "grad_norm": 0.9443874955177307, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 8680 + }, + { + "epoch": 6.373303997066373, + "grad_norm": 1.2914115190505981, + "learning_rate": 0.0002, + "loss": 0.435, + "step": 8690 + }, + { + "epoch": 6.380638063806381, + "grad_norm": 1.4558709859848022, + "learning_rate": 0.0002, + "loss": 0.4392, + "step": 8700 + }, + { + "epoch": 6.387972130546388, + "grad_norm": 1.3255952596664429, + "learning_rate": 0.0002, + "loss": 0.4398, + "step": 8710 + }, + { + "epoch": 6.395306197286395, + "grad_norm": 1.348742961883545, + "learning_rate": 0.0002, + "loss": 0.4451, + "step": 8720 + }, + { + "epoch": 6.402640264026402, + "grad_norm": 1.0096025466918945, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 8730 + }, + { + "epoch": 6.40997433076641, + "grad_norm": 1.1720590591430664, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 8740 + }, + { + "epoch": 6.417308397506417, + "grad_norm": 1.1803077459335327, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 8750 + }, + { + "epoch": 6.424642464246425, + "grad_norm": 1.3649998903274536, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 8760 + }, + { + "epoch": 6.431976530986432, + "grad_norm": 1.1503992080688477, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 8770 + }, + { + "epoch": 6.43931059772644, + "grad_norm": 1.1537176370620728, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 8780 + }, + { + "epoch": 6.446644664466446, + "grad_norm": 0.9743003845214844, + "learning_rate": 0.0002, + "loss": 0.4167, + "step": 8790 + }, + { + "epoch": 6.453978731206454, + "grad_norm": 0.9097744822502136, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 8800 + }, + { + "epoch": 6.461312797946461, + "grad_norm": 2.0174002647399902, + "learning_rate": 0.0002, + "loss": 0.4809, + "step": 8810 + }, + { + "epoch": 6.468646864686469, + "grad_norm": 1.0809309482574463, + "learning_rate": 0.0002, + "loss": 0.4879, + "step": 8820 + }, + { + "epoch": 6.475980931426476, + "grad_norm": 1.100294828414917, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 8830 + }, + { + "epoch": 6.483314998166484, + "grad_norm": 1.3707489967346191, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 8840 + }, + { + "epoch": 6.49064906490649, + "grad_norm": 1.1304761171340942, + "learning_rate": 0.0002, + "loss": 0.4533, + "step": 8850 + }, + { + "epoch": 6.497983131646498, + "grad_norm": 1.2171573638916016, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 8860 + }, + { + "epoch": 6.505317198386505, + "grad_norm": 1.0452901124954224, + "learning_rate": 0.0002, + "loss": 0.4694, + "step": 8870 + }, + { + "epoch": 6.512651265126513, + "grad_norm": 1.197298526763916, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 8880 + }, + { + "epoch": 6.51998533186652, + "grad_norm": 0.9179880619049072, + "learning_rate": 0.0002, + "loss": 0.4167, + "step": 8890 + }, + { + "epoch": 6.527319398606528, + "grad_norm": 1.415079951286316, + "learning_rate": 0.0002, + "loss": 0.445, + "step": 8900 + }, + { + "epoch": 6.534653465346535, + "grad_norm": 1.1032487154006958, + "learning_rate": 0.0002, + "loss": 0.424, + "step": 8910 + }, + { + "epoch": 6.541987532086542, + "grad_norm": 1.2295007705688477, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 8920 + }, + { + "epoch": 6.549321598826549, + "grad_norm": 1.4223219156265259, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 8930 + }, + { + "epoch": 6.556655665566557, + "grad_norm": 1.2785786390304565, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 8940 + }, + { + "epoch": 6.563989732306564, + "grad_norm": 1.3514775037765503, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 8950 + }, + { + "epoch": 6.571323799046572, + "grad_norm": 1.107937216758728, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 8960 + }, + { + "epoch": 6.578657865786578, + "grad_norm": 1.2839902639389038, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 8970 + }, + { + "epoch": 6.585991932526586, + "grad_norm": 0.9793244004249573, + "learning_rate": 0.0002, + "loss": 0.4207, + "step": 8980 + }, + { + "epoch": 6.593325999266593, + "grad_norm": 1.3403126001358032, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 8990 + }, + { + "epoch": 6.600660066006601, + "grad_norm": 1.2612813711166382, + "learning_rate": 0.0002, + "loss": 0.465, + "step": 9000 + }, + { + "epoch": 6.607994132746608, + "grad_norm": 1.4347625970840454, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 9010 + }, + { + "epoch": 6.6153281994866155, + "grad_norm": 1.225921869277954, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 9020 + }, + { + "epoch": 6.622662266226623, + "grad_norm": 1.033644676208496, + "learning_rate": 0.0002, + "loss": 0.4364, + "step": 9030 + }, + { + "epoch": 6.6299963329666305, + "grad_norm": 1.1791894435882568, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 9040 + }, + { + "epoch": 6.637330399706637, + "grad_norm": 1.0968137979507446, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 9050 + }, + { + "epoch": 6.6446644664466445, + "grad_norm": 1.5639140605926514, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 9060 + }, + { + "epoch": 6.651998533186652, + "grad_norm": 1.4158905744552612, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 9070 + }, + { + "epoch": 6.6593325999266595, + "grad_norm": 1.2120254039764404, + "learning_rate": 0.0002, + "loss": 0.4619, + "step": 9080 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.1866531372070312, + "learning_rate": 0.0002, + "loss": 0.4564, + "step": 9090 + }, + { + "epoch": 6.6740007334066735, + "grad_norm": 1.2704026699066162, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 9100 + }, + { + "epoch": 6.681334800146681, + "grad_norm": 1.1878353357315063, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 9110 + }, + { + "epoch": 6.6886688668866885, + "grad_norm": 1.193995475769043, + "learning_rate": 0.0002, + "loss": 0.4657, + "step": 9120 + }, + { + "epoch": 6.696002933626696, + "grad_norm": 1.2927545309066772, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 9130 + }, + { + "epoch": 6.703337000366703, + "grad_norm": 1.0770703554153442, + "learning_rate": 0.0002, + "loss": 0.4157, + "step": 9140 + }, + { + "epoch": 6.710671067106711, + "grad_norm": 1.2200851440429688, + "learning_rate": 0.0002, + "loss": 0.4571, + "step": 9150 + }, + { + "epoch": 6.718005133846718, + "grad_norm": 1.293891191482544, + "learning_rate": 0.0002, + "loss": 0.4605, + "step": 9160 + }, + { + "epoch": 6.725339200586725, + "grad_norm": 1.9376052618026733, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 9170 + }, + { + "epoch": 6.732673267326732, + "grad_norm": 1.0353254079818726, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 9180 + }, + { + "epoch": 6.74000733406674, + "grad_norm": 1.1274057626724243, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 9190 + }, + { + "epoch": 6.747341400806747, + "grad_norm": 1.3344064950942993, + "learning_rate": 0.0002, + "loss": 0.4486, + "step": 9200 + }, + { + "epoch": 6.754675467546755, + "grad_norm": 1.303621768951416, + "learning_rate": 0.0002, + "loss": 0.49, + "step": 9210 + }, + { + "epoch": 6.762009534286762, + "grad_norm": 1.2327780723571777, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 9220 + }, + { + "epoch": 6.769343601026769, + "grad_norm": 1.3513109683990479, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 9230 + }, + { + "epoch": 6.776677667766776, + "grad_norm": 1.4762850999832153, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 9240 + }, + { + "epoch": 6.784011734506784, + "grad_norm": 1.0967189073562622, + "learning_rate": 0.0002, + "loss": 0.4181, + "step": 9250 + }, + { + "epoch": 6.791345801246791, + "grad_norm": 0.933936357498169, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 9260 + }, + { + "epoch": 6.798679867986799, + "grad_norm": 1.065553903579712, + "learning_rate": 0.0002, + "loss": 0.4667, + "step": 9270 + }, + { + "epoch": 6.806013934726806, + "grad_norm": 1.2044163942337036, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 9280 + }, + { + "epoch": 6.813348001466814, + "grad_norm": 1.404137134552002, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 9290 + }, + { + "epoch": 6.82068206820682, + "grad_norm": 1.4005582332611084, + "learning_rate": 0.0002, + "loss": 0.4442, + "step": 9300 + }, + { + "epoch": 6.828016134946828, + "grad_norm": 1.1771104335784912, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 9310 + }, + { + "epoch": 6.835350201686835, + "grad_norm": 1.191933035850525, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 9320 + }, + { + "epoch": 6.842684268426843, + "grad_norm": 1.3395432233810425, + "learning_rate": 0.0002, + "loss": 0.4733, + "step": 9330 + }, + { + "epoch": 6.85001833516685, + "grad_norm": 1.4145503044128418, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 9340 + }, + { + "epoch": 6.857352401906858, + "grad_norm": 1.1128839254379272, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 9350 + }, + { + "epoch": 6.864686468646864, + "grad_norm": 1.0771174430847168, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 9360 + }, + { + "epoch": 6.872020535386872, + "grad_norm": 1.1089814901351929, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 9370 + }, + { + "epoch": 6.879354602126879, + "grad_norm": 1.078444004058838, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 9380 + }, + { + "epoch": 6.886688668866887, + "grad_norm": 1.3676636219024658, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 9390 + }, + { + "epoch": 6.894022735606894, + "grad_norm": 0.8973749876022339, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 9400 + }, + { + "epoch": 6.901356802346902, + "grad_norm": 1.141552448272705, + "learning_rate": 0.0002, + "loss": 0.4274, + "step": 9410 + }, + { + "epoch": 6.908690869086909, + "grad_norm": 0.8345359563827515, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 9420 + }, + { + "epoch": 6.916024935826916, + "grad_norm": 1.1602197885513306, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 9430 + }, + { + "epoch": 6.923359002566923, + "grad_norm": 1.275466799736023, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 9440 + }, + { + "epoch": 6.930693069306931, + "grad_norm": 0.9186071157455444, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 9450 + }, + { + "epoch": 6.938027136046938, + "grad_norm": 0.9069198966026306, + "learning_rate": 0.0002, + "loss": 0.4604, + "step": 9460 + }, + { + "epoch": 6.945361202786946, + "grad_norm": 1.2331899404525757, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 9470 + }, + { + "epoch": 6.952695269526953, + "grad_norm": 0.8685150742530823, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 9480 + }, + { + "epoch": 6.96002933626696, + "grad_norm": 1.4067939519882202, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 9490 + }, + { + "epoch": 6.967363403006967, + "grad_norm": 1.1864029169082642, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 9500 + }, + { + "epoch": 6.974697469746975, + "grad_norm": 1.3697725534439087, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 9510 + }, + { + "epoch": 6.982031536486982, + "grad_norm": 1.1632893085479736, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 9520 + }, + { + "epoch": 6.9893656032269895, + "grad_norm": 1.1447268724441528, + "learning_rate": 0.0002, + "loss": 0.4526, + "step": 9530 + }, + { + "epoch": 6.996699669966997, + "grad_norm": 1.5017213821411133, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 9540 + }, + { + "epoch": 6.999633296662999, + "eval_loss": 1.4178194999694824, + "eval_runtime": 32.7488, + "eval_samples_per_second": 13.161, + "eval_steps_per_second": 1.649, + "step": 9544 + } + ], + "logging_steps": 10, + "max_steps": 10904, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.416980889989284e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-9544/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2868cff7027115396e695775cacd838522aca295 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b6f6817632087b5a5e37d744e25312b96e839de5005320b96bc0c2473c41f +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bcd616a1f7d0877989def9df545f8a8fe95fdfb0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9996332966629996, "step": 1363, "epoch_duration": 1472.4565567970276, "total_accumulated_duration": 1472.4565567970276, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}]} +{"epoch": 2.0, "step": 2727, "epoch_duration": 1469.6491115093231, "total_accumulated_duration": 2942.1056683063507, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-1363", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}]} +{"epoch": 2.9996332966629997, "step": 4090, "epoch_duration": 1492.1944043636322, "total_accumulated_duration": 4434.300072669983, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}, {"eval_loss": 1.1534006595611572, "eval_runtime": 32.7541, "eval_samples_per_second": 13.159, "eval_steps_per_second": 1.649, "epoch": 2.0, "step": 2727}, {"loss": 0.9624, "grad_norm": 0.48628315329551697, "learning_rate": 0.0002, "epoch": 2.002200220022002, "step": 2730}, {"loss": 0.9603, "grad_norm": 0.413875013589859, "learning_rate": 0.0002, "epoch": 2.0095342867620096, "step": 2740}, {"loss": 0.965, "grad_norm": 0.4988735616207123, "learning_rate": 0.0002, "epoch": 2.0168683535020167, "step": 2750}, {"loss": 0.9677, "grad_norm": 0.5634812712669373, "learning_rate": 0.0002, "epoch": 2.024202420242024, "step": 2760}, {"loss": 0.9547, "grad_norm": 0.48302653431892395, "learning_rate": 0.0002, "epoch": 2.0315364869820316, "step": 2770}, {"loss": 0.9346, "grad_norm": 0.49914175271987915, "learning_rate": 0.0002, "epoch": 2.038870553722039, "step": 2780}, {"loss": 0.904, "grad_norm": 1.14039945602417, "learning_rate": 0.0002, "epoch": 2.046204620462046, "step": 2790}, {"loss": 0.9588, "grad_norm": 0.6359720826148987, "learning_rate": 0.0002, "epoch": 2.0535386872020536, "step": 2800}, {"loss": 0.9031, "grad_norm": 0.4589158296585083, "learning_rate": 0.0002, "epoch": 2.060872753942061, "step": 2810}, {"loss": 0.9438, "grad_norm": 0.46255481243133545, "learning_rate": 0.0002, "epoch": 2.068206820682068, "step": 2820}, {"loss": 0.9464, "grad_norm": 0.6232137680053711, "learning_rate": 0.0002, "epoch": 2.0755408874220755, "step": 2830}, {"loss": 0.8978, "grad_norm": 0.41042178869247437, "learning_rate": 0.0002, "epoch": 2.082874954162083, "step": 2840}, {"loss": 0.8516, "grad_norm": 0.5334428548812866, "learning_rate": 0.0002, "epoch": 2.09020902090209, "step": 2850}, {"loss": 0.9313, "grad_norm": 0.8270058631896973, "learning_rate": 0.0002, "epoch": 2.0975430876420975, "step": 2860}, {"loss": 1.0064, "grad_norm": 0.6624533534049988, "learning_rate": 0.0002, "epoch": 2.104877154382105, "step": 2870}, {"loss": 0.9196, "grad_norm": 0.5448863506317139, "learning_rate": 0.0002, "epoch": 2.112211221122112, "step": 2880}, {"loss": 0.887, "grad_norm": 0.621482789516449, "learning_rate": 0.0002, "epoch": 2.1195452878621195, "step": 2890}, {"loss": 0.9702, "grad_norm": 0.4556255340576172, "learning_rate": 0.0002, "epoch": 2.126879354602127, "step": 2900}, {"loss": 0.9323, "grad_norm": 0.4620579183101654, "learning_rate": 0.0002, "epoch": 2.1342134213421344, "step": 2910}, {"loss": 0.836, "grad_norm": 0.9602415561676025, "learning_rate": 0.0002, "epoch": 2.1415474880821415, "step": 2920}, {"loss": 0.8826, "grad_norm": 0.587943971157074, "learning_rate": 0.0002, "epoch": 2.148881554822149, "step": 2930}, {"loss": 0.971, "grad_norm": 0.5121372938156128, "learning_rate": 0.0002, "epoch": 2.1562156215621564, "step": 2940}, {"loss": 0.8751, "grad_norm": 0.49424484372138977, "learning_rate": 0.0002, "epoch": 2.1635496883021634, "step": 2950}, {"loss": 0.8674, "grad_norm": 0.6312560439109802, "learning_rate": 0.0002, "epoch": 2.170883755042171, "step": 2960}, {"loss": 0.9791, "grad_norm": 0.5235576629638672, "learning_rate": 0.0002, "epoch": 2.1782178217821784, "step": 2970}, {"loss": 0.9706, "grad_norm": 0.5868439674377441, "learning_rate": 0.0002, "epoch": 2.1855518885221854, "step": 2980}, {"loss": 0.9338, "grad_norm": 0.42302873730659485, "learning_rate": 0.0002, "epoch": 2.192885955262193, "step": 2990}, {"loss": 0.9332, "grad_norm": 0.5097725987434387, "learning_rate": 0.0002, "epoch": 2.2002200220022003, "step": 3000}, {"loss": 0.9239, "grad_norm": 0.5091572403907776, "learning_rate": 0.0002, "epoch": 2.2075540887422074, "step": 3010}, {"loss": 0.8898, "grad_norm": 0.49433162808418274, "learning_rate": 0.0002, "epoch": 2.214888155482215, "step": 3020}, {"loss": 0.9734, "grad_norm": 0.5577368140220642, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3030}, {"loss": 0.9033, "grad_norm": 0.6177583932876587, "learning_rate": 0.0002, "epoch": 2.2295562889622293, "step": 3040}, {"loss": 0.9882, "grad_norm": 0.5256719589233398, "learning_rate": 0.0002, "epoch": 2.236890355702237, "step": 3050}, {"loss": 0.9439, "grad_norm": 0.5001118183135986, "learning_rate": 0.0002, "epoch": 2.2442244224422443, "step": 3060}, {"loss": 0.8718, "grad_norm": 0.5721249580383301, "learning_rate": 0.0002, "epoch": 2.2515584891822513, "step": 3070}, {"loss": 1.0648, "grad_norm": 0.5325384140014648, "learning_rate": 0.0002, "epoch": 2.258892555922259, "step": 3080}, {"loss": 0.9843, "grad_norm": 0.5719189047813416, "learning_rate": 0.0002, "epoch": 2.2662266226622663, "step": 3090}, {"loss": 0.8633, "grad_norm": 0.6337835788726807, "learning_rate": 0.0002, "epoch": 2.2735606894022737, "step": 3100}, {"loss": 0.9962, "grad_norm": 0.5381836891174316, "learning_rate": 0.0002, "epoch": 2.2808947561422808, "step": 3110}, {"loss": 0.8265, "grad_norm": 0.5408531427383423, "learning_rate": 0.0002, "epoch": 2.2882288228822882, "step": 3120}, {"loss": 1.0325, "grad_norm": 0.43705281615257263, "learning_rate": 0.0002, "epoch": 2.2955628896222957, "step": 3130}, {"loss": 0.9388, "grad_norm": 0.6454030275344849, "learning_rate": 0.0002, "epoch": 2.3028969563623027, "step": 3140}, {"loss": 0.954, "grad_norm": 0.686030387878418, "learning_rate": 0.0002, "epoch": 2.31023102310231, "step": 3150}, {"loss": 0.9403, "grad_norm": 0.5123633146286011, "learning_rate": 0.0002, "epoch": 2.3175650898423177, "step": 3160}, {"loss": 0.8834, "grad_norm": 0.842506468296051, "learning_rate": 0.0002, "epoch": 2.3248991565823247, "step": 3170}, {"loss": 1.0497, "grad_norm": 0.5193818807601929, "learning_rate": 0.0002, "epoch": 2.332233223322332, "step": 3180}, {"loss": 0.9473, "grad_norm": 0.5634409189224243, "learning_rate": 0.0002, "epoch": 2.3395672900623397, "step": 3190}, {"loss": 0.8499, "grad_norm": 0.6475534439086914, "learning_rate": 0.0002, "epoch": 2.3469013568023467, "step": 3200}, {"loss": 0.874, "grad_norm": 1.1503914594650269, "learning_rate": 0.0002, "epoch": 2.354235423542354, "step": 3210}, {"loss": 0.9762, "grad_norm": 0.7234905362129211, "learning_rate": 0.0002, "epoch": 2.3615694902823616, "step": 3220}, {"loss": 0.9007, "grad_norm": 0.664903461933136, "learning_rate": 0.0002, "epoch": 2.368903557022369, "step": 3230}, {"loss": 0.9987, "grad_norm": 0.5453006625175476, "learning_rate": 0.0002, "epoch": 2.376237623762376, "step": 3240}, {"loss": 0.9742, "grad_norm": 0.6256654262542725, "learning_rate": 0.0002, "epoch": 2.3835716905023836, "step": 3250}, {"loss": 0.9922, "grad_norm": 0.5166565179824829, "learning_rate": 0.0002, "epoch": 2.390905757242391, "step": 3260}, {"loss": 0.927, "grad_norm": 0.5699098110198975, "learning_rate": 0.0002, "epoch": 2.398239823982398, "step": 3270}, {"loss": 0.8878, "grad_norm": 0.4472540020942688, "learning_rate": 0.0002, "epoch": 2.4055738907224056, "step": 3280}, {"loss": 0.9439, "grad_norm": 0.6790403127670288, "learning_rate": 0.0002, "epoch": 2.412907957462413, "step": 3290}, {"loss": 0.972, "grad_norm": 0.5182185173034668, "learning_rate": 0.0002, "epoch": 2.42024202420242, "step": 3300}, {"loss": 0.9775, "grad_norm": 0.564647912979126, "learning_rate": 0.0002, "epoch": 2.4275760909424275, "step": 3310}, {"loss": 1.072, "grad_norm": 0.5625313520431519, "learning_rate": 0.0002, "epoch": 2.434910157682435, "step": 3320}, {"loss": 0.8798, "grad_norm": 0.7496559619903564, "learning_rate": 0.0002, "epoch": 2.442244224422442, "step": 3330}, {"loss": 0.868, "grad_norm": 0.4779128134250641, "learning_rate": 0.0002, "epoch": 2.4495782911624495, "step": 3340}, {"loss": 1.0316, "grad_norm": 0.578093409538269, "learning_rate": 0.0002, "epoch": 2.456912357902457, "step": 3350}, {"loss": 0.9282, "grad_norm": 0.5456080436706543, "learning_rate": 0.0002, "epoch": 2.4642464246424645, "step": 3360}, {"loss": 0.8409, "grad_norm": 0.4769273102283478, "learning_rate": 0.0002, "epoch": 2.4715804913824715, "step": 3370}, {"loss": 0.9312, "grad_norm": 0.5608189702033997, "learning_rate": 0.0002, "epoch": 2.478914558122479, "step": 3380}, {"loss": 0.9934, "grad_norm": 0.5590165853500366, "learning_rate": 0.0002, "epoch": 2.4862486248624864, "step": 3390}, {"loss": 1.025, "grad_norm": 0.801306962966919, "learning_rate": 0.0002, "epoch": 2.4935826916024935, "step": 3400}, {"loss": 0.9049, "grad_norm": 0.6045624613761902, "learning_rate": 0.0002, "epoch": 2.500916758342501, "step": 3410}, {"loss": 0.944, "grad_norm": 0.5735858082771301, "learning_rate": 0.0002, "epoch": 2.5082508250825084, "step": 3420}, {"loss": 0.9846, "grad_norm": 0.6827309131622314, "learning_rate": 0.0002, "epoch": 2.5155848918225154, "step": 3430}, {"loss": 0.9789, "grad_norm": 0.5702602863311768, "learning_rate": 0.0002, "epoch": 2.522918958562523, "step": 3440}, {"loss": 0.9127, "grad_norm": 0.6674721240997314, "learning_rate": 0.0002, "epoch": 2.5302530253025304, "step": 3450}, {"loss": 0.914, "grad_norm": 0.5635907649993896, "learning_rate": 0.0002, "epoch": 2.5375870920425374, "step": 3460}, {"loss": 0.8398, "grad_norm": 0.42737770080566406, "learning_rate": 0.0002, "epoch": 2.544921158782545, "step": 3470}, {"loss": 0.9474, "grad_norm": 0.6720691919326782, "learning_rate": 0.0002, "epoch": 2.5522552255225524, "step": 3480}, {"loss": 0.8637, "grad_norm": 0.8917084336280823, "learning_rate": 0.0002, "epoch": 2.55958929226256, "step": 3490}, {"loss": 0.9257, "grad_norm": 0.5134549140930176, "learning_rate": 0.0002, "epoch": 2.566923359002567, "step": 3500}, {"loss": 0.9362, "grad_norm": 0.4951367974281311, "learning_rate": 0.0002, "epoch": 2.5742574257425743, "step": 3510}, {"loss": 0.9184, "grad_norm": 0.9438204765319824, "learning_rate": 0.0002, "epoch": 2.5815914924825814, "step": 3520}, {"loss": 0.8939, "grad_norm": 0.6024714708328247, "learning_rate": 0.0002, "epoch": 2.588925559222589, "step": 3530}, {"loss": 0.9298, "grad_norm": 0.5248535871505737, "learning_rate": 0.0002, "epoch": 2.5962596259625963, "step": 3540}, {"loss": 0.941, "grad_norm": 0.8677568435668945, "learning_rate": 0.0002, "epoch": 2.6035936927026038, "step": 3550}, {"loss": 0.9253, "grad_norm": 0.82008296251297, "learning_rate": 0.0002, "epoch": 2.610927759442611, "step": 3560}, {"loss": 0.8429, "grad_norm": 0.4724634885787964, "learning_rate": 0.0002, "epoch": 2.6182618261826183, "step": 3570}, {"loss": 0.9058, "grad_norm": 0.5434244275093079, "learning_rate": 0.0002, "epoch": 2.6255958929226257, "step": 3580}, {"loss": 0.9379, "grad_norm": 0.4948740005493164, "learning_rate": 0.0002, "epoch": 2.6329299596626328, "step": 3590}, {"loss": 0.8718, "grad_norm": 0.42109328508377075, "learning_rate": 0.0002, "epoch": 2.6402640264026402, "step": 3600}, {"loss": 0.9809, "grad_norm": 0.7979786396026611, "learning_rate": 0.0002, "epoch": 2.6475980931426477, "step": 3610}, {"loss": 0.9229, "grad_norm": 0.6345919370651245, "learning_rate": 0.0002, "epoch": 2.654932159882655, "step": 3620}, {"loss": 0.8506, "grad_norm": 0.4971671402454376, "learning_rate": 0.0002, "epoch": 2.662266226622662, "step": 3630}, {"loss": 0.8054, "grad_norm": 0.6467748284339905, "learning_rate": 0.0002, "epoch": 2.6696002933626697, "step": 3640}, {"loss": 0.9277, "grad_norm": 0.4240160286426544, "learning_rate": 0.0002, "epoch": 2.6769343601026767, "step": 3650}, {"loss": 0.8213, "grad_norm": 0.5179754495620728, "learning_rate": 0.0002, "epoch": 2.684268426842684, "step": 3660}, {"loss": 0.9221, "grad_norm": 0.754012405872345, "learning_rate": 0.0002, "epoch": 2.6916024935826917, "step": 3670}, {"loss": 0.9194, "grad_norm": 0.5141299962997437, "learning_rate": 0.0002, "epoch": 2.698936560322699, "step": 3680}, {"loss": 0.9495, "grad_norm": 0.5737819075584412, "learning_rate": 0.0002, "epoch": 2.706270627062706, "step": 3690}, {"loss": 1.0162, "grad_norm": 0.5887577533721924, "learning_rate": 0.0002, "epoch": 2.7136046938027136, "step": 3700}, {"loss": 0.9169, "grad_norm": 0.6740471720695496, "learning_rate": 0.0002, "epoch": 2.720938760542721, "step": 3710}, {"loss": 0.9297, "grad_norm": 0.5879453420639038, "learning_rate": 0.0002, "epoch": 2.728272827282728, "step": 3720}, {"loss": 0.9358, "grad_norm": 0.4858354926109314, "learning_rate": 0.0002, "epoch": 2.7356068940227356, "step": 3730}, {"loss": 0.9308, "grad_norm": 0.5489001870155334, "learning_rate": 0.0002, "epoch": 2.742940960762743, "step": 3740}, {"loss": 0.894, "grad_norm": 0.8187092542648315, "learning_rate": 0.0002, "epoch": 2.7502750275027505, "step": 3750}, {"loss": 0.8954, "grad_norm": 0.5666626691818237, "learning_rate": 0.0002, "epoch": 2.7576090942427576, "step": 3760}, {"loss": 1.0059, "grad_norm": 0.5377066135406494, "learning_rate": 0.0002, "epoch": 2.764943160982765, "step": 3770}, {"loss": 0.9132, "grad_norm": 0.566330075263977, "learning_rate": 0.0002, "epoch": 2.772277227722772, "step": 3780}, {"loss": 0.9415, "grad_norm": 0.5522832870483398, "learning_rate": 0.0002, "epoch": 2.7796112944627795, "step": 3790}, {"loss": 0.8816, "grad_norm": 0.5668695569038391, "learning_rate": 0.0002, "epoch": 2.786945361202787, "step": 3800}, {"loss": 0.8885, "grad_norm": 0.7566602826118469, "learning_rate": 0.0002, "epoch": 2.7942794279427945, "step": 3810}, {"loss": 0.8598, "grad_norm": 0.5603684782981873, "learning_rate": 0.0002, "epoch": 2.8016134946828015, "step": 3820}, {"loss": 0.9602, "grad_norm": 0.49122217297554016, "learning_rate": 0.0002, "epoch": 2.808947561422809, "step": 3830}, {"loss": 0.9738, "grad_norm": 0.6798251867294312, "learning_rate": 0.0002, "epoch": 2.816281628162816, "step": 3840}, {"loss": 0.9533, "grad_norm": 0.6097991466522217, "learning_rate": 0.0002, "epoch": 2.8236156949028235, "step": 3850}, {"loss": 0.8672, "grad_norm": 0.6675726175308228, "learning_rate": 0.0002, "epoch": 2.830949761642831, "step": 3860}, {"loss": 0.9324, "grad_norm": 0.9223952889442444, "learning_rate": 0.0002, "epoch": 2.8382838283828384, "step": 3870}, {"loss": 0.8767, "grad_norm": 0.6020799875259399, "learning_rate": 0.0002, "epoch": 2.8456178951228455, "step": 3880}, {"loss": 0.9148, "grad_norm": 0.5206381678581238, "learning_rate": 0.0002, "epoch": 2.852951961862853, "step": 3890}, {"loss": 0.9479, "grad_norm": 0.6268777251243591, "learning_rate": 0.0002, "epoch": 2.8602860286028604, "step": 3900}, {"loss": 0.9409, "grad_norm": 1.1583497524261475, "learning_rate": 0.0002, "epoch": 2.8676200953428674, "step": 3910}, {"loss": 0.895, "grad_norm": 0.7263903021812439, "learning_rate": 0.0002, "epoch": 2.874954162082875, "step": 3920}, {"loss": 0.8786, "grad_norm": 0.5369910001754761, "learning_rate": 0.0002, "epoch": 2.8822882288228824, "step": 3930}, {"loss": 1.0015, "grad_norm": 0.7298350930213928, "learning_rate": 0.0002, "epoch": 2.88962229556289, "step": 3940}, {"loss": 0.979, "grad_norm": 0.577012836933136, "learning_rate": 0.0002, "epoch": 2.896956362302897, "step": 3950}, {"loss": 0.9716, "grad_norm": 0.5859594345092773, "learning_rate": 0.0002, "epoch": 2.9042904290429044, "step": 3960}, {"loss": 0.8772, "grad_norm": 0.47176122665405273, "learning_rate": 0.0002, "epoch": 2.9116244957829114, "step": 3970}, {"loss": 0.8997, "grad_norm": 0.9699620604515076, "learning_rate": 0.0002, "epoch": 2.918958562522919, "step": 3980}, {"loss": 0.9057, "grad_norm": 0.7908747792243958, "learning_rate": 0.0002, "epoch": 2.9262926292629263, "step": 3990}, {"loss": 0.9462, "grad_norm": 0.5777379274368286, "learning_rate": 0.0002, "epoch": 2.933626696002934, "step": 4000}, {"loss": 0.9358, "grad_norm": 0.599288284778595, "learning_rate": 0.0002, "epoch": 2.940960762742941, "step": 4010}, {"loss": 0.9812, "grad_norm": 0.5232274532318115, "learning_rate": 0.0002, "epoch": 2.9482948294829483, "step": 4020}, {"loss": 0.96, "grad_norm": 0.6395137310028076, "learning_rate": 0.0002, "epoch": 2.9556288962229558, "step": 4030}, {"loss": 0.9813, "grad_norm": 0.589260458946228, "learning_rate": 0.0002, "epoch": 2.962962962962963, "step": 4040}, {"loss": 0.9541, "grad_norm": 0.5699581503868103, "learning_rate": 0.0002, "epoch": 2.9702970297029703, "step": 4050}, {"loss": 0.9585, "grad_norm": 0.528468132019043, "learning_rate": 0.0002, "epoch": 2.9776310964429777, "step": 4060}, {"loss": 0.9164, "grad_norm": 0.4804670512676239, "learning_rate": 0.0002, "epoch": 2.984965163182985, "step": 4070}, {"loss": 0.9771, "grad_norm": 1.1918889284133911, "learning_rate": 0.0002, "epoch": 2.9922992299229922, "step": 4080}, {"loss": 0.9178, "grad_norm": 0.5479103326797485, "learning_rate": 0.0002, "epoch": 2.9996332966629997, "step": 4090}]} +{"epoch": 4.0, "step": 5454, "epoch_duration": 1471.1494228839874, "total_accumulated_duration": 5905.44949555397, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}, {"eval_loss": 1.1534006595611572, "eval_runtime": 32.7541, "eval_samples_per_second": 13.159, "eval_steps_per_second": 1.649, "epoch": 2.0, "step": 2727}, {"loss": 0.9624, "grad_norm": 0.48628315329551697, "learning_rate": 0.0002, "epoch": 2.002200220022002, "step": 2730}, {"loss": 0.9603, "grad_norm": 0.413875013589859, "learning_rate": 0.0002, "epoch": 2.0095342867620096, "step": 2740}, {"loss": 0.965, "grad_norm": 0.4988735616207123, "learning_rate": 0.0002, "epoch": 2.0168683535020167, "step": 2750}, {"loss": 0.9677, "grad_norm": 0.5634812712669373, "learning_rate": 0.0002, "epoch": 2.024202420242024, "step": 2760}, {"loss": 0.9547, "grad_norm": 0.48302653431892395, "learning_rate": 0.0002, "epoch": 2.0315364869820316, "step": 2770}, {"loss": 0.9346, "grad_norm": 0.49914175271987915, "learning_rate": 0.0002, "epoch": 2.038870553722039, "step": 2780}, {"loss": 0.904, "grad_norm": 1.14039945602417, "learning_rate": 0.0002, "epoch": 2.046204620462046, "step": 2790}, {"loss": 0.9588, "grad_norm": 0.6359720826148987, "learning_rate": 0.0002, "epoch": 2.0535386872020536, "step": 2800}, {"loss": 0.9031, "grad_norm": 0.4589158296585083, "learning_rate": 0.0002, "epoch": 2.060872753942061, "step": 2810}, {"loss": 0.9438, "grad_norm": 0.46255481243133545, "learning_rate": 0.0002, "epoch": 2.068206820682068, "step": 2820}, {"loss": 0.9464, "grad_norm": 0.6232137680053711, "learning_rate": 0.0002, "epoch": 2.0755408874220755, "step": 2830}, {"loss": 0.8978, "grad_norm": 0.41042178869247437, "learning_rate": 0.0002, "epoch": 2.082874954162083, "step": 2840}, {"loss": 0.8516, "grad_norm": 0.5334428548812866, "learning_rate": 0.0002, "epoch": 2.09020902090209, "step": 2850}, {"loss": 0.9313, "grad_norm": 0.8270058631896973, "learning_rate": 0.0002, "epoch": 2.0975430876420975, "step": 2860}, {"loss": 1.0064, "grad_norm": 0.6624533534049988, "learning_rate": 0.0002, "epoch": 2.104877154382105, "step": 2870}, {"loss": 0.9196, "grad_norm": 0.5448863506317139, "learning_rate": 0.0002, "epoch": 2.112211221122112, "step": 2880}, {"loss": 0.887, "grad_norm": 0.621482789516449, "learning_rate": 0.0002, "epoch": 2.1195452878621195, "step": 2890}, {"loss": 0.9702, "grad_norm": 0.4556255340576172, "learning_rate": 0.0002, "epoch": 2.126879354602127, "step": 2900}, {"loss": 0.9323, "grad_norm": 0.4620579183101654, "learning_rate": 0.0002, "epoch": 2.1342134213421344, "step": 2910}, {"loss": 0.836, "grad_norm": 0.9602415561676025, "learning_rate": 0.0002, "epoch": 2.1415474880821415, "step": 2920}, {"loss": 0.8826, "grad_norm": 0.587943971157074, "learning_rate": 0.0002, "epoch": 2.148881554822149, "step": 2930}, {"loss": 0.971, "grad_norm": 0.5121372938156128, "learning_rate": 0.0002, "epoch": 2.1562156215621564, "step": 2940}, {"loss": 0.8751, "grad_norm": 0.49424484372138977, "learning_rate": 0.0002, "epoch": 2.1635496883021634, "step": 2950}, {"loss": 0.8674, "grad_norm": 0.6312560439109802, "learning_rate": 0.0002, "epoch": 2.170883755042171, "step": 2960}, {"loss": 0.9791, "grad_norm": 0.5235576629638672, "learning_rate": 0.0002, "epoch": 2.1782178217821784, "step": 2970}, {"loss": 0.9706, "grad_norm": 0.5868439674377441, "learning_rate": 0.0002, "epoch": 2.1855518885221854, "step": 2980}, {"loss": 0.9338, "grad_norm": 0.42302873730659485, "learning_rate": 0.0002, "epoch": 2.192885955262193, "step": 2990}, {"loss": 0.9332, "grad_norm": 0.5097725987434387, "learning_rate": 0.0002, "epoch": 2.2002200220022003, "step": 3000}, {"loss": 0.9239, "grad_norm": 0.5091572403907776, "learning_rate": 0.0002, "epoch": 2.2075540887422074, "step": 3010}, {"loss": 0.8898, "grad_norm": 0.49433162808418274, "learning_rate": 0.0002, "epoch": 2.214888155482215, "step": 3020}, {"loss": 0.9734, "grad_norm": 0.5577368140220642, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3030}, {"loss": 0.9033, "grad_norm": 0.6177583932876587, "learning_rate": 0.0002, "epoch": 2.2295562889622293, "step": 3040}, {"loss": 0.9882, "grad_norm": 0.5256719589233398, "learning_rate": 0.0002, "epoch": 2.236890355702237, "step": 3050}, {"loss": 0.9439, "grad_norm": 0.5001118183135986, "learning_rate": 0.0002, "epoch": 2.2442244224422443, "step": 3060}, {"loss": 0.8718, "grad_norm": 0.5721249580383301, "learning_rate": 0.0002, "epoch": 2.2515584891822513, "step": 3070}, {"loss": 1.0648, "grad_norm": 0.5325384140014648, "learning_rate": 0.0002, "epoch": 2.258892555922259, "step": 3080}, {"loss": 0.9843, "grad_norm": 0.5719189047813416, "learning_rate": 0.0002, "epoch": 2.2662266226622663, "step": 3090}, {"loss": 0.8633, "grad_norm": 0.6337835788726807, "learning_rate": 0.0002, "epoch": 2.2735606894022737, "step": 3100}, {"loss": 0.9962, "grad_norm": 0.5381836891174316, "learning_rate": 0.0002, "epoch": 2.2808947561422808, "step": 3110}, {"loss": 0.8265, "grad_norm": 0.5408531427383423, "learning_rate": 0.0002, "epoch": 2.2882288228822882, "step": 3120}, {"loss": 1.0325, "grad_norm": 0.43705281615257263, "learning_rate": 0.0002, "epoch": 2.2955628896222957, "step": 3130}, {"loss": 0.9388, "grad_norm": 0.6454030275344849, "learning_rate": 0.0002, "epoch": 2.3028969563623027, "step": 3140}, {"loss": 0.954, "grad_norm": 0.686030387878418, "learning_rate": 0.0002, "epoch": 2.31023102310231, "step": 3150}, {"loss": 0.9403, "grad_norm": 0.5123633146286011, "learning_rate": 0.0002, "epoch": 2.3175650898423177, "step": 3160}, {"loss": 0.8834, "grad_norm": 0.842506468296051, "learning_rate": 0.0002, "epoch": 2.3248991565823247, "step": 3170}, {"loss": 1.0497, "grad_norm": 0.5193818807601929, "learning_rate": 0.0002, "epoch": 2.332233223322332, "step": 3180}, {"loss": 0.9473, "grad_norm": 0.5634409189224243, "learning_rate": 0.0002, "epoch": 2.3395672900623397, "step": 3190}, {"loss": 0.8499, "grad_norm": 0.6475534439086914, "learning_rate": 0.0002, "epoch": 2.3469013568023467, "step": 3200}, {"loss": 0.874, "grad_norm": 1.1503914594650269, "learning_rate": 0.0002, "epoch": 2.354235423542354, "step": 3210}, {"loss": 0.9762, "grad_norm": 0.7234905362129211, "learning_rate": 0.0002, "epoch": 2.3615694902823616, "step": 3220}, {"loss": 0.9007, "grad_norm": 0.664903461933136, "learning_rate": 0.0002, "epoch": 2.368903557022369, "step": 3230}, {"loss": 0.9987, "grad_norm": 0.5453006625175476, "learning_rate": 0.0002, "epoch": 2.376237623762376, "step": 3240}, {"loss": 0.9742, "grad_norm": 0.6256654262542725, "learning_rate": 0.0002, "epoch": 2.3835716905023836, "step": 3250}, {"loss": 0.9922, "grad_norm": 0.5166565179824829, "learning_rate": 0.0002, "epoch": 2.390905757242391, "step": 3260}, {"loss": 0.927, "grad_norm": 0.5699098110198975, "learning_rate": 0.0002, "epoch": 2.398239823982398, "step": 3270}, {"loss": 0.8878, "grad_norm": 0.4472540020942688, "learning_rate": 0.0002, "epoch": 2.4055738907224056, "step": 3280}, {"loss": 0.9439, "grad_norm": 0.6790403127670288, "learning_rate": 0.0002, "epoch": 2.412907957462413, "step": 3290}, {"loss": 0.972, "grad_norm": 0.5182185173034668, "learning_rate": 0.0002, "epoch": 2.42024202420242, "step": 3300}, {"loss": 0.9775, "grad_norm": 0.564647912979126, "learning_rate": 0.0002, "epoch": 2.4275760909424275, "step": 3310}, {"loss": 1.072, "grad_norm": 0.5625313520431519, "learning_rate": 0.0002, "epoch": 2.434910157682435, "step": 3320}, {"loss": 0.8798, "grad_norm": 0.7496559619903564, "learning_rate": 0.0002, "epoch": 2.442244224422442, "step": 3330}, {"loss": 0.868, "grad_norm": 0.4779128134250641, "learning_rate": 0.0002, "epoch": 2.4495782911624495, "step": 3340}, {"loss": 1.0316, "grad_norm": 0.578093409538269, "learning_rate": 0.0002, "epoch": 2.456912357902457, "step": 3350}, {"loss": 0.9282, "grad_norm": 0.5456080436706543, "learning_rate": 0.0002, "epoch": 2.4642464246424645, "step": 3360}, {"loss": 0.8409, "grad_norm": 0.4769273102283478, "learning_rate": 0.0002, "epoch": 2.4715804913824715, "step": 3370}, {"loss": 0.9312, "grad_norm": 0.5608189702033997, "learning_rate": 0.0002, "epoch": 2.478914558122479, "step": 3380}, {"loss": 0.9934, "grad_norm": 0.5590165853500366, "learning_rate": 0.0002, "epoch": 2.4862486248624864, "step": 3390}, {"loss": 1.025, "grad_norm": 0.801306962966919, "learning_rate": 0.0002, "epoch": 2.4935826916024935, "step": 3400}, {"loss": 0.9049, "grad_norm": 0.6045624613761902, "learning_rate": 0.0002, "epoch": 2.500916758342501, "step": 3410}, {"loss": 0.944, "grad_norm": 0.5735858082771301, "learning_rate": 0.0002, "epoch": 2.5082508250825084, "step": 3420}, {"loss": 0.9846, "grad_norm": 0.6827309131622314, "learning_rate": 0.0002, "epoch": 2.5155848918225154, "step": 3430}, {"loss": 0.9789, "grad_norm": 0.5702602863311768, "learning_rate": 0.0002, "epoch": 2.522918958562523, "step": 3440}, {"loss": 0.9127, "grad_norm": 0.6674721240997314, "learning_rate": 0.0002, "epoch": 2.5302530253025304, "step": 3450}, {"loss": 0.914, "grad_norm": 0.5635907649993896, "learning_rate": 0.0002, "epoch": 2.5375870920425374, "step": 3460}, {"loss": 0.8398, "grad_norm": 0.42737770080566406, "learning_rate": 0.0002, "epoch": 2.544921158782545, "step": 3470}, {"loss": 0.9474, "grad_norm": 0.6720691919326782, "learning_rate": 0.0002, "epoch": 2.5522552255225524, "step": 3480}, {"loss": 0.8637, "grad_norm": 0.8917084336280823, "learning_rate": 0.0002, "epoch": 2.55958929226256, "step": 3490}, {"loss": 0.9257, "grad_norm": 0.5134549140930176, "learning_rate": 0.0002, "epoch": 2.566923359002567, "step": 3500}, {"loss": 0.9362, "grad_norm": 0.4951367974281311, "learning_rate": 0.0002, "epoch": 2.5742574257425743, "step": 3510}, {"loss": 0.9184, "grad_norm": 0.9438204765319824, "learning_rate": 0.0002, "epoch": 2.5815914924825814, "step": 3520}, {"loss": 0.8939, "grad_norm": 0.6024714708328247, "learning_rate": 0.0002, "epoch": 2.588925559222589, "step": 3530}, {"loss": 0.9298, "grad_norm": 0.5248535871505737, "learning_rate": 0.0002, "epoch": 2.5962596259625963, "step": 3540}, {"loss": 0.941, "grad_norm": 0.8677568435668945, "learning_rate": 0.0002, "epoch": 2.6035936927026038, "step": 3550}, {"loss": 0.9253, "grad_norm": 0.82008296251297, "learning_rate": 0.0002, "epoch": 2.610927759442611, "step": 3560}, {"loss": 0.8429, "grad_norm": 0.4724634885787964, "learning_rate": 0.0002, "epoch": 2.6182618261826183, "step": 3570}, {"loss": 0.9058, "grad_norm": 0.5434244275093079, "learning_rate": 0.0002, "epoch": 2.6255958929226257, "step": 3580}, {"loss": 0.9379, "grad_norm": 0.4948740005493164, "learning_rate": 0.0002, "epoch": 2.6329299596626328, "step": 3590}, {"loss": 0.8718, "grad_norm": 0.42109328508377075, "learning_rate": 0.0002, "epoch": 2.6402640264026402, "step": 3600}, {"loss": 0.9809, "grad_norm": 0.7979786396026611, "learning_rate": 0.0002, "epoch": 2.6475980931426477, "step": 3610}, {"loss": 0.9229, "grad_norm": 0.6345919370651245, "learning_rate": 0.0002, "epoch": 2.654932159882655, "step": 3620}, {"loss": 0.8506, "grad_norm": 0.4971671402454376, "learning_rate": 0.0002, "epoch": 2.662266226622662, "step": 3630}, {"loss": 0.8054, "grad_norm": 0.6467748284339905, "learning_rate": 0.0002, "epoch": 2.6696002933626697, "step": 3640}, {"loss": 0.9277, "grad_norm": 0.4240160286426544, "learning_rate": 0.0002, "epoch": 2.6769343601026767, "step": 3650}, {"loss": 0.8213, "grad_norm": 0.5179754495620728, "learning_rate": 0.0002, "epoch": 2.684268426842684, "step": 3660}, {"loss": 0.9221, "grad_norm": 0.754012405872345, "learning_rate": 0.0002, "epoch": 2.6916024935826917, "step": 3670}, {"loss": 0.9194, "grad_norm": 0.5141299962997437, "learning_rate": 0.0002, "epoch": 2.698936560322699, "step": 3680}, {"loss": 0.9495, "grad_norm": 0.5737819075584412, "learning_rate": 0.0002, "epoch": 2.706270627062706, "step": 3690}, {"loss": 1.0162, "grad_norm": 0.5887577533721924, "learning_rate": 0.0002, "epoch": 2.7136046938027136, "step": 3700}, {"loss": 0.9169, "grad_norm": 0.6740471720695496, "learning_rate": 0.0002, "epoch": 2.720938760542721, "step": 3710}, {"loss": 0.9297, "grad_norm": 0.5879453420639038, "learning_rate": 0.0002, "epoch": 2.728272827282728, "step": 3720}, {"loss": 0.9358, "grad_norm": 0.4858354926109314, "learning_rate": 0.0002, "epoch": 2.7356068940227356, "step": 3730}, {"loss": 0.9308, "grad_norm": 0.5489001870155334, "learning_rate": 0.0002, "epoch": 2.742940960762743, "step": 3740}, {"loss": 0.894, "grad_norm": 0.8187092542648315, "learning_rate": 0.0002, "epoch": 2.7502750275027505, "step": 3750}, {"loss": 0.8954, "grad_norm": 0.5666626691818237, "learning_rate": 0.0002, "epoch": 2.7576090942427576, "step": 3760}, {"loss": 1.0059, "grad_norm": 0.5377066135406494, "learning_rate": 0.0002, "epoch": 2.764943160982765, "step": 3770}, {"loss": 0.9132, "grad_norm": 0.566330075263977, "learning_rate": 0.0002, "epoch": 2.772277227722772, "step": 3780}, {"loss": 0.9415, "grad_norm": 0.5522832870483398, "learning_rate": 0.0002, "epoch": 2.7796112944627795, "step": 3790}, {"loss": 0.8816, "grad_norm": 0.5668695569038391, "learning_rate": 0.0002, "epoch": 2.786945361202787, "step": 3800}, {"loss": 0.8885, "grad_norm": 0.7566602826118469, "learning_rate": 0.0002, "epoch": 2.7942794279427945, "step": 3810}, {"loss": 0.8598, "grad_norm": 0.5603684782981873, "learning_rate": 0.0002, "epoch": 2.8016134946828015, "step": 3820}, {"loss": 0.9602, "grad_norm": 0.49122217297554016, "learning_rate": 0.0002, "epoch": 2.808947561422809, "step": 3830}, {"loss": 0.9738, "grad_norm": 0.6798251867294312, "learning_rate": 0.0002, "epoch": 2.816281628162816, "step": 3840}, {"loss": 0.9533, "grad_norm": 0.6097991466522217, "learning_rate": 0.0002, "epoch": 2.8236156949028235, "step": 3850}, {"loss": 0.8672, "grad_norm": 0.6675726175308228, "learning_rate": 0.0002, "epoch": 2.830949761642831, "step": 3860}, {"loss": 0.9324, "grad_norm": 0.9223952889442444, "learning_rate": 0.0002, "epoch": 2.8382838283828384, "step": 3870}, {"loss": 0.8767, "grad_norm": 0.6020799875259399, "learning_rate": 0.0002, "epoch": 2.8456178951228455, "step": 3880}, {"loss": 0.9148, "grad_norm": 0.5206381678581238, "learning_rate": 0.0002, "epoch": 2.852951961862853, "step": 3890}, {"loss": 0.9479, "grad_norm": 0.6268777251243591, "learning_rate": 0.0002, "epoch": 2.8602860286028604, "step": 3900}, {"loss": 0.9409, "grad_norm": 1.1583497524261475, "learning_rate": 0.0002, "epoch": 2.8676200953428674, "step": 3910}, {"loss": 0.895, "grad_norm": 0.7263903021812439, "learning_rate": 0.0002, "epoch": 2.874954162082875, "step": 3920}, {"loss": 0.8786, "grad_norm": 0.5369910001754761, "learning_rate": 0.0002, "epoch": 2.8822882288228824, "step": 3930}, {"loss": 1.0015, "grad_norm": 0.7298350930213928, "learning_rate": 0.0002, "epoch": 2.88962229556289, "step": 3940}, {"loss": 0.979, "grad_norm": 0.577012836933136, "learning_rate": 0.0002, "epoch": 2.896956362302897, "step": 3950}, {"loss": 0.9716, "grad_norm": 0.5859594345092773, "learning_rate": 0.0002, "epoch": 2.9042904290429044, "step": 3960}, {"loss": 0.8772, "grad_norm": 0.47176122665405273, "learning_rate": 0.0002, "epoch": 2.9116244957829114, "step": 3970}, {"loss": 0.8997, "grad_norm": 0.9699620604515076, "learning_rate": 0.0002, "epoch": 2.918958562522919, "step": 3980}, {"loss": 0.9057, "grad_norm": 0.7908747792243958, "learning_rate": 0.0002, "epoch": 2.9262926292629263, "step": 3990}, {"loss": 0.9462, "grad_norm": 0.5777379274368286, "learning_rate": 0.0002, "epoch": 2.933626696002934, "step": 4000}, {"loss": 0.9358, "grad_norm": 0.599288284778595, "learning_rate": 0.0002, "epoch": 2.940960762742941, "step": 4010}, {"loss": 0.9812, "grad_norm": 0.5232274532318115, "learning_rate": 0.0002, "epoch": 2.9482948294829483, "step": 4020}, {"loss": 0.96, "grad_norm": 0.6395137310028076, "learning_rate": 0.0002, "epoch": 2.9556288962229558, "step": 4030}, {"loss": 0.9813, "grad_norm": 0.589260458946228, "learning_rate": 0.0002, "epoch": 2.962962962962963, "step": 4040}, {"loss": 0.9541, "grad_norm": 0.5699581503868103, "learning_rate": 0.0002, "epoch": 2.9702970297029703, "step": 4050}, {"loss": 0.9585, "grad_norm": 0.528468132019043, "learning_rate": 0.0002, "epoch": 2.9776310964429777, "step": 4060}, {"loss": 0.9164, "grad_norm": 0.4804670512676239, "learning_rate": 0.0002, "epoch": 2.984965163182985, "step": 4070}, {"loss": 0.9771, "grad_norm": 1.1918889284133911, "learning_rate": 0.0002, "epoch": 2.9922992299229922, "step": 4080}, {"loss": 0.9178, "grad_norm": 0.5479103326797485, "learning_rate": 0.0002, "epoch": 2.9996332966629997, "step": 4090}, {"eval_loss": 1.1642853021621704, "eval_runtime": 32.7511, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.649, "epoch": 2.9996332966629997, "step": 4090}, {"loss": 0.7981, "grad_norm": 0.7430027723312378, "learning_rate": 0.0002, "epoch": 3.006967363403007, "step": 4100}, {"loss": 0.7871, "grad_norm": 0.6293647289276123, "learning_rate": 0.0002, "epoch": 3.014301430143014, "step": 4110}, {"loss": 0.78, "grad_norm": 0.6191329956054688, "learning_rate": 0.0002, "epoch": 3.0216354968830217, "step": 4120}, {"loss": 0.7618, "grad_norm": 0.7959313988685608, "learning_rate": 0.0002, "epoch": 3.028969563623029, "step": 4130}, {"loss": 0.8039, "grad_norm": 0.5956351161003113, "learning_rate": 0.0002, "epoch": 3.036303630363036, "step": 4140}, {"loss": 0.7477, "grad_norm": 0.670383632183075, "learning_rate": 0.0002, "epoch": 3.0436376971030437, "step": 4150}, {"loss": 0.7984, "grad_norm": 0.6414518356323242, "learning_rate": 0.0002, "epoch": 3.050971763843051, "step": 4160}, {"loss": 0.7369, "grad_norm": 0.7928852438926697, "learning_rate": 0.0002, "epoch": 3.058305830583058, "step": 4170}, {"loss": 0.7914, "grad_norm": 0.6211121082305908, "learning_rate": 0.0002, "epoch": 3.0656398973230656, "step": 4180}, {"loss": 0.7365, "grad_norm": 0.6237057447433472, "learning_rate": 0.0002, "epoch": 3.072973964063073, "step": 4190}, {"loss": 0.702, "grad_norm": 0.6522233486175537, "learning_rate": 0.0002, "epoch": 3.08030803080308, "step": 4200}, {"loss": 0.7646, "grad_norm": 0.9396848678588867, "learning_rate": 0.0002, "epoch": 3.0876420975430876, "step": 4210}, {"loss": 0.7559, "grad_norm": 0.8003010749816895, "learning_rate": 0.0002, "epoch": 3.094976164283095, "step": 4220}, {"loss": 0.711, "grad_norm": 0.6733810305595398, "learning_rate": 0.0002, "epoch": 3.102310231023102, "step": 4230}, {"loss": 0.696, "grad_norm": 0.6365828514099121, "learning_rate": 0.0002, "epoch": 3.1096442977631096, "step": 4240}, {"loss": 0.8362, "grad_norm": 1.0805548429489136, "learning_rate": 0.0002, "epoch": 3.116978364503117, "step": 4250}, {"loss": 0.7651, "grad_norm": 0.7262141108512878, "learning_rate": 0.0002, "epoch": 3.1243124312431245, "step": 4260}, {"loss": 0.7304, "grad_norm": 0.5500539541244507, "learning_rate": 0.0002, "epoch": 3.1316464979831315, "step": 4270}, {"loss": 0.7721, "grad_norm": 0.793912947177887, "learning_rate": 0.0002, "epoch": 3.138980564723139, "step": 4280}, {"loss": 0.7708, "grad_norm": 1.2540518045425415, "learning_rate": 0.0002, "epoch": 3.1463146314631465, "step": 4290}, {"loss": 0.782, "grad_norm": 0.7020077705383301, "learning_rate": 0.0002, "epoch": 3.1536486982031535, "step": 4300}, {"loss": 0.7253, "grad_norm": 0.5111123323440552, "learning_rate": 0.0002, "epoch": 3.160982764943161, "step": 4310}, {"loss": 0.8159, "grad_norm": 0.7172090411186218, "learning_rate": 0.0002, "epoch": 3.1683168316831685, "step": 4320}, {"loss": 0.6962, "grad_norm": 0.6343168616294861, "learning_rate": 0.0002, "epoch": 3.1756508984231755, "step": 4330}, {"loss": 0.7938, "grad_norm": 0.9563672542572021, "learning_rate": 0.0002, "epoch": 3.182984965163183, "step": 4340}, {"loss": 0.7385, "grad_norm": 1.0225574970245361, "learning_rate": 0.0002, "epoch": 3.1903190319031904, "step": 4350}, {"loss": 0.8652, "grad_norm": 1.1633386611938477, "learning_rate": 0.0002, "epoch": 3.1976530986431975, "step": 4360}, {"loss": 0.7259, "grad_norm": 0.8915148973464966, "learning_rate": 0.0002, "epoch": 3.204987165383205, "step": 4370}, {"loss": 0.8061, "grad_norm": 0.9156812429428101, "learning_rate": 0.0002, "epoch": 3.2123212321232124, "step": 4380}, {"loss": 0.8189, "grad_norm": 0.6363258957862854, "learning_rate": 0.0002, "epoch": 3.21965529886322, "step": 4390}, {"loss": 0.7996, "grad_norm": 0.579099178314209, "learning_rate": 0.0002, "epoch": 3.226989365603227, "step": 4400}, {"loss": 0.8592, "grad_norm": 0.8778146505355835, "learning_rate": 0.0002, "epoch": 3.2343234323432344, "step": 4410}, {"loss": 0.8281, "grad_norm": 0.8356770873069763, "learning_rate": 0.0002, "epoch": 3.241657499083242, "step": 4420}, {"loss": 0.8484, "grad_norm": 0.702032208442688, "learning_rate": 0.0002, "epoch": 3.248991565823249, "step": 4430}, {"loss": 0.7227, "grad_norm": 0.6386539340019226, "learning_rate": 0.0002, "epoch": 3.2563256325632564, "step": 4440}, {"loss": 0.8374, "grad_norm": 0.7008408904075623, "learning_rate": 0.0002, "epoch": 3.263659699303264, "step": 4450}, {"loss": 0.7572, "grad_norm": 0.9556332230567932, "learning_rate": 0.0002, "epoch": 3.270993766043271, "step": 4460}, {"loss": 0.743, "grad_norm": 0.5667835474014282, "learning_rate": 0.0002, "epoch": 3.2783278327832783, "step": 4470}, {"loss": 0.8152, "grad_norm": 0.8239172697067261, "learning_rate": 0.0002, "epoch": 3.285661899523286, "step": 4480}, {"loss": 0.756, "grad_norm": 0.7045050859451294, "learning_rate": 0.0002, "epoch": 3.292995966263293, "step": 4490}, {"loss": 0.7655, "grad_norm": 0.7131434082984924, "learning_rate": 0.0002, "epoch": 3.3003300330033003, "step": 4500}, {"loss": 0.836, "grad_norm": 0.6924910545349121, "learning_rate": 0.0002, "epoch": 3.3076640997433078, "step": 4510}, {"loss": 0.736, "grad_norm": 0.8945356607437134, "learning_rate": 0.0002, "epoch": 3.3149981664833152, "step": 4520}, {"loss": 0.7575, "grad_norm": 0.6546903252601624, "learning_rate": 0.0002, "epoch": 3.3223322332233223, "step": 4530}, {"loss": 0.7893, "grad_norm": 0.8206679224967957, "learning_rate": 0.0002, "epoch": 3.3296662999633297, "step": 4540}, {"loss": 0.7502, "grad_norm": 0.6482203602790833, "learning_rate": 0.0002, "epoch": 3.3370003667033368, "step": 4550}, {"loss": 0.8172, "grad_norm": 0.7558760046958923, "learning_rate": 0.0002, "epoch": 3.3443344334433442, "step": 4560}, {"loss": 0.744, "grad_norm": 0.7794756889343262, "learning_rate": 0.0002, "epoch": 3.3516685001833517, "step": 4570}, {"loss": 0.7385, "grad_norm": 0.7382805943489075, "learning_rate": 0.0002, "epoch": 3.359002566923359, "step": 4580}, {"loss": 0.8511, "grad_norm": 0.5912511944770813, "learning_rate": 0.0002, "epoch": 3.366336633663366, "step": 4590}, {"loss": 0.8272, "grad_norm": 0.7444885969161987, "learning_rate": 0.0002, "epoch": 3.3736707004033737, "step": 4600}, {"loss": 0.7927, "grad_norm": 0.7354922890663147, "learning_rate": 0.0002, "epoch": 3.381004767143381, "step": 4610}, {"loss": 0.7183, "grad_norm": 0.7685934901237488, "learning_rate": 0.0002, "epoch": 3.388338833883388, "step": 4620}, {"loss": 0.7436, "grad_norm": 0.61041259765625, "learning_rate": 0.0002, "epoch": 3.3956729006233957, "step": 4630}, {"loss": 0.7661, "grad_norm": 0.6820451021194458, "learning_rate": 0.0002, "epoch": 3.403006967363403, "step": 4640}, {"loss": 0.8796, "grad_norm": 0.5819534063339233, "learning_rate": 0.0002, "epoch": 3.41034103410341, "step": 4650}, {"loss": 0.7314, "grad_norm": 0.705410897731781, "learning_rate": 0.0002, "epoch": 3.4176751008434176, "step": 4660}, {"loss": 0.7901, "grad_norm": 0.8052892088890076, "learning_rate": 0.0002, "epoch": 3.425009167583425, "step": 4670}, {"loss": 0.7298, "grad_norm": 0.7746483087539673, "learning_rate": 0.0002, "epoch": 3.432343234323432, "step": 4680}, {"loss": 0.7976, "grad_norm": 0.7713689804077148, "learning_rate": 0.0002, "epoch": 3.4396773010634396, "step": 4690}, {"loss": 0.7427, "grad_norm": 0.810371994972229, "learning_rate": 0.0002, "epoch": 3.447011367803447, "step": 4700}, {"loss": 0.7594, "grad_norm": 0.7702969312667847, "learning_rate": 0.0002, "epoch": 3.4543454345434546, "step": 4710}, {"loss": 0.7957, "grad_norm": 0.7069268822669983, "learning_rate": 0.0002, "epoch": 3.4616795012834616, "step": 4720}, {"loss": 0.8199, "grad_norm": 0.7640359401702881, "learning_rate": 0.0002, "epoch": 3.469013568023469, "step": 4730}, {"loss": 0.6875, "grad_norm": 0.8661707639694214, "learning_rate": 0.0002, "epoch": 3.4763476347634765, "step": 4740}, {"loss": 0.8528, "grad_norm": 0.9970282912254333, "learning_rate": 0.0002, "epoch": 3.4836817015034836, "step": 4750}, {"loss": 0.8462, "grad_norm": 0.5824355483055115, "learning_rate": 0.0002, "epoch": 3.491015768243491, "step": 4760}, {"loss": 0.851, "grad_norm": 1.3072649240493774, "learning_rate": 0.0002, "epoch": 3.4983498349834985, "step": 4770}, {"loss": 0.9101, "grad_norm": 0.873978316783905, "learning_rate": 0.0002, "epoch": 3.5056839017235055, "step": 4780}, {"loss": 0.7403, "grad_norm": 0.5526657104492188, "learning_rate": 0.0002, "epoch": 3.513017968463513, "step": 4790}, {"loss": 0.7921, "grad_norm": 0.790894627571106, "learning_rate": 0.0002, "epoch": 3.5203520352035205, "step": 4800}, {"loss": 0.831, "grad_norm": 0.8119630217552185, "learning_rate": 0.0002, "epoch": 3.5276861019435275, "step": 4810}, {"loss": 0.7351, "grad_norm": 0.633212149143219, "learning_rate": 0.0002, "epoch": 3.535020168683535, "step": 4820}, {"loss": 0.8505, "grad_norm": 0.703029990196228, "learning_rate": 0.0002, "epoch": 3.5423542354235424, "step": 4830}, {"loss": 0.7204, "grad_norm": 0.7603771686553955, "learning_rate": 0.0002, "epoch": 3.54968830216355, "step": 4840}, {"loss": 0.8868, "grad_norm": 0.6260480880737305, "learning_rate": 0.0002, "epoch": 3.557022368903557, "step": 4850}, {"loss": 0.8137, "grad_norm": 0.8203664422035217, "learning_rate": 0.0002, "epoch": 3.5643564356435644, "step": 4860}, {"loss": 0.8821, "grad_norm": 0.7793813347816467, "learning_rate": 0.0002, "epoch": 3.5716905023835714, "step": 4870}, {"loss": 0.8164, "grad_norm": 0.7667397260665894, "learning_rate": 0.0002, "epoch": 3.579024569123579, "step": 4880}, {"loss": 0.7597, "grad_norm": 0.8198829889297485, "learning_rate": 0.0002, "epoch": 3.5863586358635864, "step": 4890}, {"loss": 0.7027, "grad_norm": 0.7689233422279358, "learning_rate": 0.0002, "epoch": 3.593692702603594, "step": 4900}, {"loss": 0.804, "grad_norm": 0.7870983481407166, "learning_rate": 0.0002, "epoch": 3.601026769343601, "step": 4910}, {"loss": 0.8269, "grad_norm": 0.8133853077888489, "learning_rate": 0.0002, "epoch": 3.6083608360836084, "step": 4920}, {"loss": 0.8515, "grad_norm": 1.308401346206665, "learning_rate": 0.0002, "epoch": 3.615694902823616, "step": 4930}, {"loss": 0.8494, "grad_norm": 0.7131121754646301, "learning_rate": 0.0002, "epoch": 3.623028969563623, "step": 4940}, {"loss": 0.7235, "grad_norm": 0.6825910210609436, "learning_rate": 0.0002, "epoch": 3.6303630363036303, "step": 4950}, {"loss": 0.7824, "grad_norm": 0.7254678606987, "learning_rate": 0.0002, "epoch": 3.637697103043638, "step": 4960}, {"loss": 0.7983, "grad_norm": 0.8045085072517395, "learning_rate": 0.0002, "epoch": 3.6450311697836453, "step": 4970}, {"loss": 0.8223, "grad_norm": 0.6991777420043945, "learning_rate": 0.0002, "epoch": 3.6523652365236523, "step": 4980}, {"loss": 0.7806, "grad_norm": 0.7804713249206543, "learning_rate": 0.0002, "epoch": 3.6596993032636598, "step": 4990}, {"loss": 0.8402, "grad_norm": 0.8525708317756653, "learning_rate": 0.0002, "epoch": 3.667033370003667, "step": 5000}, {"loss": 0.8496, "grad_norm": 0.7959994673728943, "learning_rate": 0.0002, "epoch": 3.6743674367436743, "step": 5010}, {"loss": 0.8022, "grad_norm": 0.8103628158569336, "learning_rate": 0.0002, "epoch": 3.6817015034836817, "step": 5020}, {"loss": 0.7376, "grad_norm": 0.7517836093902588, "learning_rate": 0.0002, "epoch": 3.689035570223689, "step": 5030}, {"loss": 0.8375, "grad_norm": 0.6878514289855957, "learning_rate": 0.0002, "epoch": 3.6963696369636962, "step": 5040}, {"loss": 0.7998, "grad_norm": 1.2371820211410522, "learning_rate": 0.0002, "epoch": 3.7037037037037037, "step": 5050}, {"loss": 0.6941, "grad_norm": 0.6567103862762451, "learning_rate": 0.0002, "epoch": 3.711037770443711, "step": 5060}, {"loss": 0.8465, "grad_norm": 1.1254922151565552, "learning_rate": 0.0002, "epoch": 3.718371837183718, "step": 5070}, {"loss": 0.8365, "grad_norm": 0.6796132326126099, "learning_rate": 0.0002, "epoch": 3.7257059039237257, "step": 5080}, {"loss": 0.7818, "grad_norm": 0.7285300493240356, "learning_rate": 0.0002, "epoch": 3.733039970663733, "step": 5090}, {"loss": 0.8581, "grad_norm": 0.8931500911712646, "learning_rate": 0.0002, "epoch": 3.7403740374037406, "step": 5100}, {"loss": 0.8181, "grad_norm": 0.6256856918334961, "learning_rate": 0.0002, "epoch": 3.7477081041437477, "step": 5110}, {"loss": 0.743, "grad_norm": 0.79310142993927, "learning_rate": 0.0002, "epoch": 3.755042170883755, "step": 5120}, {"loss": 0.8235, "grad_norm": 0.6594041585922241, "learning_rate": 0.0002, "epoch": 3.762376237623762, "step": 5130}, {"loss": 0.6925, "grad_norm": 0.7029327750205994, "learning_rate": 0.0002, "epoch": 3.7697103043637696, "step": 5140}, {"loss": 0.7457, "grad_norm": 0.5880070328712463, "learning_rate": 0.0002, "epoch": 3.777044371103777, "step": 5150}, {"loss": 0.8716, "grad_norm": 0.7578945159912109, "learning_rate": 0.0002, "epoch": 3.7843784378437846, "step": 5160}, {"loss": 0.8819, "grad_norm": 0.8276378512382507, "learning_rate": 0.0002, "epoch": 3.7917125045837916, "step": 5170}, {"loss": 0.7559, "grad_norm": 0.7627953886985779, "learning_rate": 0.0002, "epoch": 3.799046571323799, "step": 5180}, {"loss": 0.7665, "grad_norm": 0.8169086575508118, "learning_rate": 0.0002, "epoch": 3.806380638063806, "step": 5190}, {"loss": 0.761, "grad_norm": 0.6605030298233032, "learning_rate": 0.0002, "epoch": 3.8137147048038136, "step": 5200}, {"loss": 0.8804, "grad_norm": 0.5837286114692688, "learning_rate": 0.0002, "epoch": 3.821048771543821, "step": 5210}, {"loss": 0.8369, "grad_norm": 1.2422157526016235, "learning_rate": 0.0002, "epoch": 3.8283828382838285, "step": 5220}, {"loss": 0.8431, "grad_norm": 0.6589220762252808, "learning_rate": 0.0002, "epoch": 3.8357169050238356, "step": 5230}, {"loss": 0.7686, "grad_norm": 0.8567556142807007, "learning_rate": 0.0002, "epoch": 3.843050971763843, "step": 5240}, {"loss": 0.8652, "grad_norm": 0.6490627527236938, "learning_rate": 0.0002, "epoch": 3.8503850385038505, "step": 5250}, {"loss": 0.7386, "grad_norm": 0.620232880115509, "learning_rate": 0.0002, "epoch": 3.8577191052438575, "step": 5260}, {"loss": 0.9192, "grad_norm": 0.7685128450393677, "learning_rate": 0.0002, "epoch": 3.865053171983865, "step": 5270}, {"loss": 0.872, "grad_norm": 0.8113296627998352, "learning_rate": 0.0002, "epoch": 3.8723872387238725, "step": 5280}, {"loss": 0.7156, "grad_norm": 0.8092675805091858, "learning_rate": 0.0002, "epoch": 3.87972130546388, "step": 5290}, {"loss": 0.7325, "grad_norm": 0.583570122718811, "learning_rate": 0.0002, "epoch": 3.887055372203887, "step": 5300}, {"loss": 0.9333, "grad_norm": 1.712363600730896, "learning_rate": 0.0002, "epoch": 3.8943894389438944, "step": 5310}, {"loss": 0.7537, "grad_norm": 0.6673534512519836, "learning_rate": 0.0002, "epoch": 3.9017235056839015, "step": 5320}, {"loss": 0.7035, "grad_norm": 1.9770312309265137, "learning_rate": 0.0002, "epoch": 3.909057572423909, "step": 5330}, {"loss": 0.8793, "grad_norm": 0.6430999636650085, "learning_rate": 0.0002, "epoch": 3.9163916391639164, "step": 5340}, {"loss": 0.839, "grad_norm": 1.0159571170806885, "learning_rate": 0.0002, "epoch": 3.923725705903924, "step": 5350}, {"loss": 0.9332, "grad_norm": 0.8607584834098816, "learning_rate": 0.0002, "epoch": 3.931059772643931, "step": 5360}, {"loss": 0.7261, "grad_norm": 0.6967900991439819, "learning_rate": 0.0002, "epoch": 3.9383938393839384, "step": 5370}, {"loss": 0.8456, "grad_norm": 0.7683077454566956, "learning_rate": 0.0002, "epoch": 3.945727906123946, "step": 5380}, {"loss": 0.7682, "grad_norm": 0.6805762648582458, "learning_rate": 0.0002, "epoch": 3.953061972863953, "step": 5390}, {"loss": 0.7746, "grad_norm": 0.7033619284629822, "learning_rate": 0.0002, "epoch": 3.9603960396039604, "step": 5400}, {"loss": 0.8393, "grad_norm": 0.966112494468689, "learning_rate": 0.0002, "epoch": 3.967730106343968, "step": 5410}, {"loss": 0.8316, "grad_norm": 0.8467881083488464, "learning_rate": 0.0002, "epoch": 3.9750641730839753, "step": 5420}, {"loss": 0.8084, "grad_norm": 0.8005317449569702, "learning_rate": 0.0002, "epoch": 3.9823982398239823, "step": 5430}, {"loss": 0.7168, "grad_norm": 1.1615241765975952, "learning_rate": 0.0002, "epoch": 3.98973230656399, "step": 5440}, {"loss": 0.8263, "grad_norm": 0.6121614575386047, "learning_rate": 0.0002, "epoch": 3.997066373303997, "step": 5450}]} +{"epoch": 4.999633296662999, "step": 6817, "epoch_duration": 1471.577754497528, "total_accumulated_duration": 7377.027250051498, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}, {"eval_loss": 1.1534006595611572, "eval_runtime": 32.7541, "eval_samples_per_second": 13.159, "eval_steps_per_second": 1.649, "epoch": 2.0, "step": 2727}, {"loss": 0.9624, "grad_norm": 0.48628315329551697, "learning_rate": 0.0002, "epoch": 2.002200220022002, "step": 2730}, {"loss": 0.9603, "grad_norm": 0.413875013589859, "learning_rate": 0.0002, "epoch": 2.0095342867620096, "step": 2740}, {"loss": 0.965, "grad_norm": 0.4988735616207123, "learning_rate": 0.0002, "epoch": 2.0168683535020167, "step": 2750}, {"loss": 0.9677, "grad_norm": 0.5634812712669373, "learning_rate": 0.0002, "epoch": 2.024202420242024, "step": 2760}, {"loss": 0.9547, "grad_norm": 0.48302653431892395, "learning_rate": 0.0002, "epoch": 2.0315364869820316, "step": 2770}, {"loss": 0.9346, "grad_norm": 0.49914175271987915, "learning_rate": 0.0002, "epoch": 2.038870553722039, "step": 2780}, {"loss": 0.904, "grad_norm": 1.14039945602417, "learning_rate": 0.0002, "epoch": 2.046204620462046, "step": 2790}, {"loss": 0.9588, "grad_norm": 0.6359720826148987, "learning_rate": 0.0002, "epoch": 2.0535386872020536, "step": 2800}, {"loss": 0.9031, "grad_norm": 0.4589158296585083, "learning_rate": 0.0002, "epoch": 2.060872753942061, "step": 2810}, {"loss": 0.9438, "grad_norm": 0.46255481243133545, "learning_rate": 0.0002, "epoch": 2.068206820682068, "step": 2820}, {"loss": 0.9464, "grad_norm": 0.6232137680053711, "learning_rate": 0.0002, "epoch": 2.0755408874220755, "step": 2830}, {"loss": 0.8978, "grad_norm": 0.41042178869247437, "learning_rate": 0.0002, "epoch": 2.082874954162083, "step": 2840}, {"loss": 0.8516, "grad_norm": 0.5334428548812866, "learning_rate": 0.0002, "epoch": 2.09020902090209, "step": 2850}, {"loss": 0.9313, "grad_norm": 0.8270058631896973, "learning_rate": 0.0002, "epoch": 2.0975430876420975, "step": 2860}, {"loss": 1.0064, "grad_norm": 0.6624533534049988, "learning_rate": 0.0002, "epoch": 2.104877154382105, "step": 2870}, {"loss": 0.9196, "grad_norm": 0.5448863506317139, "learning_rate": 0.0002, "epoch": 2.112211221122112, "step": 2880}, {"loss": 0.887, "grad_norm": 0.621482789516449, "learning_rate": 0.0002, "epoch": 2.1195452878621195, "step": 2890}, {"loss": 0.9702, "grad_norm": 0.4556255340576172, "learning_rate": 0.0002, "epoch": 2.126879354602127, "step": 2900}, {"loss": 0.9323, "grad_norm": 0.4620579183101654, "learning_rate": 0.0002, "epoch": 2.1342134213421344, "step": 2910}, {"loss": 0.836, "grad_norm": 0.9602415561676025, "learning_rate": 0.0002, "epoch": 2.1415474880821415, "step": 2920}, {"loss": 0.8826, "grad_norm": 0.587943971157074, "learning_rate": 0.0002, "epoch": 2.148881554822149, "step": 2930}, {"loss": 0.971, "grad_norm": 0.5121372938156128, "learning_rate": 0.0002, "epoch": 2.1562156215621564, "step": 2940}, {"loss": 0.8751, "grad_norm": 0.49424484372138977, "learning_rate": 0.0002, "epoch": 2.1635496883021634, "step": 2950}, {"loss": 0.8674, "grad_norm": 0.6312560439109802, "learning_rate": 0.0002, "epoch": 2.170883755042171, "step": 2960}, {"loss": 0.9791, "grad_norm": 0.5235576629638672, "learning_rate": 0.0002, "epoch": 2.1782178217821784, "step": 2970}, {"loss": 0.9706, "grad_norm": 0.5868439674377441, "learning_rate": 0.0002, "epoch": 2.1855518885221854, "step": 2980}, {"loss": 0.9338, "grad_norm": 0.42302873730659485, "learning_rate": 0.0002, "epoch": 2.192885955262193, "step": 2990}, {"loss": 0.9332, "grad_norm": 0.5097725987434387, "learning_rate": 0.0002, "epoch": 2.2002200220022003, "step": 3000}, {"loss": 0.9239, "grad_norm": 0.5091572403907776, "learning_rate": 0.0002, "epoch": 2.2075540887422074, "step": 3010}, {"loss": 0.8898, "grad_norm": 0.49433162808418274, "learning_rate": 0.0002, "epoch": 2.214888155482215, "step": 3020}, {"loss": 0.9734, "grad_norm": 0.5577368140220642, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3030}, {"loss": 0.9033, "grad_norm": 0.6177583932876587, "learning_rate": 0.0002, "epoch": 2.2295562889622293, "step": 3040}, {"loss": 0.9882, "grad_norm": 0.5256719589233398, "learning_rate": 0.0002, "epoch": 2.236890355702237, "step": 3050}, {"loss": 0.9439, "grad_norm": 0.5001118183135986, "learning_rate": 0.0002, "epoch": 2.2442244224422443, "step": 3060}, {"loss": 0.8718, "grad_norm": 0.5721249580383301, "learning_rate": 0.0002, "epoch": 2.2515584891822513, "step": 3070}, {"loss": 1.0648, "grad_norm": 0.5325384140014648, "learning_rate": 0.0002, "epoch": 2.258892555922259, "step": 3080}, {"loss": 0.9843, "grad_norm": 0.5719189047813416, "learning_rate": 0.0002, "epoch": 2.2662266226622663, "step": 3090}, {"loss": 0.8633, "grad_norm": 0.6337835788726807, "learning_rate": 0.0002, "epoch": 2.2735606894022737, "step": 3100}, {"loss": 0.9962, "grad_norm": 0.5381836891174316, "learning_rate": 0.0002, "epoch": 2.2808947561422808, "step": 3110}, {"loss": 0.8265, "grad_norm": 0.5408531427383423, "learning_rate": 0.0002, "epoch": 2.2882288228822882, "step": 3120}, {"loss": 1.0325, "grad_norm": 0.43705281615257263, "learning_rate": 0.0002, "epoch": 2.2955628896222957, "step": 3130}, {"loss": 0.9388, "grad_norm": 0.6454030275344849, "learning_rate": 0.0002, "epoch": 2.3028969563623027, "step": 3140}, {"loss": 0.954, "grad_norm": 0.686030387878418, "learning_rate": 0.0002, "epoch": 2.31023102310231, "step": 3150}, {"loss": 0.9403, "grad_norm": 0.5123633146286011, "learning_rate": 0.0002, "epoch": 2.3175650898423177, "step": 3160}, {"loss": 0.8834, "grad_norm": 0.842506468296051, "learning_rate": 0.0002, "epoch": 2.3248991565823247, "step": 3170}, {"loss": 1.0497, "grad_norm": 0.5193818807601929, "learning_rate": 0.0002, "epoch": 2.332233223322332, "step": 3180}, {"loss": 0.9473, "grad_norm": 0.5634409189224243, "learning_rate": 0.0002, "epoch": 2.3395672900623397, "step": 3190}, {"loss": 0.8499, "grad_norm": 0.6475534439086914, "learning_rate": 0.0002, "epoch": 2.3469013568023467, "step": 3200}, {"loss": 0.874, "grad_norm": 1.1503914594650269, "learning_rate": 0.0002, "epoch": 2.354235423542354, "step": 3210}, {"loss": 0.9762, "grad_norm": 0.7234905362129211, "learning_rate": 0.0002, "epoch": 2.3615694902823616, "step": 3220}, {"loss": 0.9007, "grad_norm": 0.664903461933136, "learning_rate": 0.0002, "epoch": 2.368903557022369, "step": 3230}, {"loss": 0.9987, "grad_norm": 0.5453006625175476, "learning_rate": 0.0002, "epoch": 2.376237623762376, "step": 3240}, {"loss": 0.9742, "grad_norm": 0.6256654262542725, "learning_rate": 0.0002, "epoch": 2.3835716905023836, "step": 3250}, {"loss": 0.9922, "grad_norm": 0.5166565179824829, "learning_rate": 0.0002, "epoch": 2.390905757242391, "step": 3260}, {"loss": 0.927, "grad_norm": 0.5699098110198975, "learning_rate": 0.0002, "epoch": 2.398239823982398, "step": 3270}, {"loss": 0.8878, "grad_norm": 0.4472540020942688, "learning_rate": 0.0002, "epoch": 2.4055738907224056, "step": 3280}, {"loss": 0.9439, "grad_norm": 0.6790403127670288, "learning_rate": 0.0002, "epoch": 2.412907957462413, "step": 3290}, {"loss": 0.972, "grad_norm": 0.5182185173034668, "learning_rate": 0.0002, "epoch": 2.42024202420242, "step": 3300}, {"loss": 0.9775, "grad_norm": 0.564647912979126, "learning_rate": 0.0002, "epoch": 2.4275760909424275, "step": 3310}, {"loss": 1.072, "grad_norm": 0.5625313520431519, "learning_rate": 0.0002, "epoch": 2.434910157682435, "step": 3320}, {"loss": 0.8798, "grad_norm": 0.7496559619903564, "learning_rate": 0.0002, "epoch": 2.442244224422442, "step": 3330}, {"loss": 0.868, "grad_norm": 0.4779128134250641, "learning_rate": 0.0002, "epoch": 2.4495782911624495, "step": 3340}, {"loss": 1.0316, "grad_norm": 0.578093409538269, "learning_rate": 0.0002, "epoch": 2.456912357902457, "step": 3350}, {"loss": 0.9282, "grad_norm": 0.5456080436706543, "learning_rate": 0.0002, "epoch": 2.4642464246424645, "step": 3360}, {"loss": 0.8409, "grad_norm": 0.4769273102283478, "learning_rate": 0.0002, "epoch": 2.4715804913824715, "step": 3370}, {"loss": 0.9312, "grad_norm": 0.5608189702033997, "learning_rate": 0.0002, "epoch": 2.478914558122479, "step": 3380}, {"loss": 0.9934, "grad_norm": 0.5590165853500366, "learning_rate": 0.0002, "epoch": 2.4862486248624864, "step": 3390}, {"loss": 1.025, "grad_norm": 0.801306962966919, "learning_rate": 0.0002, "epoch": 2.4935826916024935, "step": 3400}, {"loss": 0.9049, "grad_norm": 0.6045624613761902, "learning_rate": 0.0002, "epoch": 2.500916758342501, "step": 3410}, {"loss": 0.944, "grad_norm": 0.5735858082771301, "learning_rate": 0.0002, "epoch": 2.5082508250825084, "step": 3420}, {"loss": 0.9846, "grad_norm": 0.6827309131622314, "learning_rate": 0.0002, "epoch": 2.5155848918225154, "step": 3430}, {"loss": 0.9789, "grad_norm": 0.5702602863311768, "learning_rate": 0.0002, "epoch": 2.522918958562523, "step": 3440}, {"loss": 0.9127, "grad_norm": 0.6674721240997314, "learning_rate": 0.0002, "epoch": 2.5302530253025304, "step": 3450}, {"loss": 0.914, "grad_norm": 0.5635907649993896, "learning_rate": 0.0002, "epoch": 2.5375870920425374, "step": 3460}, {"loss": 0.8398, "grad_norm": 0.42737770080566406, "learning_rate": 0.0002, "epoch": 2.544921158782545, "step": 3470}, {"loss": 0.9474, "grad_norm": 0.6720691919326782, "learning_rate": 0.0002, "epoch": 2.5522552255225524, "step": 3480}, {"loss": 0.8637, "grad_norm": 0.8917084336280823, "learning_rate": 0.0002, "epoch": 2.55958929226256, "step": 3490}, {"loss": 0.9257, "grad_norm": 0.5134549140930176, "learning_rate": 0.0002, "epoch": 2.566923359002567, "step": 3500}, {"loss": 0.9362, "grad_norm": 0.4951367974281311, "learning_rate": 0.0002, "epoch": 2.5742574257425743, "step": 3510}, {"loss": 0.9184, "grad_norm": 0.9438204765319824, "learning_rate": 0.0002, "epoch": 2.5815914924825814, "step": 3520}, {"loss": 0.8939, "grad_norm": 0.6024714708328247, "learning_rate": 0.0002, "epoch": 2.588925559222589, "step": 3530}, {"loss": 0.9298, "grad_norm": 0.5248535871505737, "learning_rate": 0.0002, "epoch": 2.5962596259625963, "step": 3540}, {"loss": 0.941, "grad_norm": 0.8677568435668945, "learning_rate": 0.0002, "epoch": 2.6035936927026038, "step": 3550}, {"loss": 0.9253, "grad_norm": 0.82008296251297, "learning_rate": 0.0002, "epoch": 2.610927759442611, "step": 3560}, {"loss": 0.8429, "grad_norm": 0.4724634885787964, "learning_rate": 0.0002, "epoch": 2.6182618261826183, "step": 3570}, {"loss": 0.9058, "grad_norm": 0.5434244275093079, "learning_rate": 0.0002, "epoch": 2.6255958929226257, "step": 3580}, {"loss": 0.9379, "grad_norm": 0.4948740005493164, "learning_rate": 0.0002, "epoch": 2.6329299596626328, "step": 3590}, {"loss": 0.8718, "grad_norm": 0.42109328508377075, "learning_rate": 0.0002, "epoch": 2.6402640264026402, "step": 3600}, {"loss": 0.9809, "grad_norm": 0.7979786396026611, "learning_rate": 0.0002, "epoch": 2.6475980931426477, "step": 3610}, {"loss": 0.9229, "grad_norm": 0.6345919370651245, "learning_rate": 0.0002, "epoch": 2.654932159882655, "step": 3620}, {"loss": 0.8506, "grad_norm": 0.4971671402454376, "learning_rate": 0.0002, "epoch": 2.662266226622662, "step": 3630}, {"loss": 0.8054, "grad_norm": 0.6467748284339905, "learning_rate": 0.0002, "epoch": 2.6696002933626697, "step": 3640}, {"loss": 0.9277, "grad_norm": 0.4240160286426544, "learning_rate": 0.0002, "epoch": 2.6769343601026767, "step": 3650}, {"loss": 0.8213, "grad_norm": 0.5179754495620728, "learning_rate": 0.0002, "epoch": 2.684268426842684, "step": 3660}, {"loss": 0.9221, "grad_norm": 0.754012405872345, "learning_rate": 0.0002, "epoch": 2.6916024935826917, "step": 3670}, {"loss": 0.9194, "grad_norm": 0.5141299962997437, "learning_rate": 0.0002, "epoch": 2.698936560322699, "step": 3680}, {"loss": 0.9495, "grad_norm": 0.5737819075584412, "learning_rate": 0.0002, "epoch": 2.706270627062706, "step": 3690}, {"loss": 1.0162, "grad_norm": 0.5887577533721924, "learning_rate": 0.0002, "epoch": 2.7136046938027136, "step": 3700}, {"loss": 0.9169, "grad_norm": 0.6740471720695496, "learning_rate": 0.0002, "epoch": 2.720938760542721, "step": 3710}, {"loss": 0.9297, "grad_norm": 0.5879453420639038, "learning_rate": 0.0002, "epoch": 2.728272827282728, "step": 3720}, {"loss": 0.9358, "grad_norm": 0.4858354926109314, "learning_rate": 0.0002, "epoch": 2.7356068940227356, "step": 3730}, {"loss": 0.9308, "grad_norm": 0.5489001870155334, "learning_rate": 0.0002, "epoch": 2.742940960762743, "step": 3740}, {"loss": 0.894, "grad_norm": 0.8187092542648315, "learning_rate": 0.0002, "epoch": 2.7502750275027505, "step": 3750}, {"loss": 0.8954, "grad_norm": 0.5666626691818237, "learning_rate": 0.0002, "epoch": 2.7576090942427576, "step": 3760}, {"loss": 1.0059, "grad_norm": 0.5377066135406494, "learning_rate": 0.0002, "epoch": 2.764943160982765, "step": 3770}, {"loss": 0.9132, "grad_norm": 0.566330075263977, "learning_rate": 0.0002, "epoch": 2.772277227722772, "step": 3780}, {"loss": 0.9415, "grad_norm": 0.5522832870483398, "learning_rate": 0.0002, "epoch": 2.7796112944627795, "step": 3790}, {"loss": 0.8816, "grad_norm": 0.5668695569038391, "learning_rate": 0.0002, "epoch": 2.786945361202787, "step": 3800}, {"loss": 0.8885, "grad_norm": 0.7566602826118469, "learning_rate": 0.0002, "epoch": 2.7942794279427945, "step": 3810}, {"loss": 0.8598, "grad_norm": 0.5603684782981873, "learning_rate": 0.0002, "epoch": 2.8016134946828015, "step": 3820}, {"loss": 0.9602, "grad_norm": 0.49122217297554016, "learning_rate": 0.0002, "epoch": 2.808947561422809, "step": 3830}, {"loss": 0.9738, "grad_norm": 0.6798251867294312, "learning_rate": 0.0002, "epoch": 2.816281628162816, "step": 3840}, {"loss": 0.9533, "grad_norm": 0.6097991466522217, "learning_rate": 0.0002, "epoch": 2.8236156949028235, "step": 3850}, {"loss": 0.8672, "grad_norm": 0.6675726175308228, "learning_rate": 0.0002, "epoch": 2.830949761642831, "step": 3860}, {"loss": 0.9324, "grad_norm": 0.9223952889442444, "learning_rate": 0.0002, "epoch": 2.8382838283828384, "step": 3870}, {"loss": 0.8767, "grad_norm": 0.6020799875259399, "learning_rate": 0.0002, "epoch": 2.8456178951228455, "step": 3880}, {"loss": 0.9148, "grad_norm": 0.5206381678581238, "learning_rate": 0.0002, "epoch": 2.852951961862853, "step": 3890}, {"loss": 0.9479, "grad_norm": 0.6268777251243591, "learning_rate": 0.0002, "epoch": 2.8602860286028604, "step": 3900}, {"loss": 0.9409, "grad_norm": 1.1583497524261475, "learning_rate": 0.0002, "epoch": 2.8676200953428674, "step": 3910}, {"loss": 0.895, "grad_norm": 0.7263903021812439, "learning_rate": 0.0002, "epoch": 2.874954162082875, "step": 3920}, {"loss": 0.8786, "grad_norm": 0.5369910001754761, "learning_rate": 0.0002, "epoch": 2.8822882288228824, "step": 3930}, {"loss": 1.0015, "grad_norm": 0.7298350930213928, "learning_rate": 0.0002, "epoch": 2.88962229556289, "step": 3940}, {"loss": 0.979, "grad_norm": 0.577012836933136, "learning_rate": 0.0002, "epoch": 2.896956362302897, "step": 3950}, {"loss": 0.9716, "grad_norm": 0.5859594345092773, "learning_rate": 0.0002, "epoch": 2.9042904290429044, "step": 3960}, {"loss": 0.8772, "grad_norm": 0.47176122665405273, "learning_rate": 0.0002, "epoch": 2.9116244957829114, "step": 3970}, {"loss": 0.8997, "grad_norm": 0.9699620604515076, "learning_rate": 0.0002, "epoch": 2.918958562522919, "step": 3980}, {"loss": 0.9057, "grad_norm": 0.7908747792243958, "learning_rate": 0.0002, "epoch": 2.9262926292629263, "step": 3990}, {"loss": 0.9462, "grad_norm": 0.5777379274368286, "learning_rate": 0.0002, "epoch": 2.933626696002934, "step": 4000}, {"loss": 0.9358, "grad_norm": 0.599288284778595, "learning_rate": 0.0002, "epoch": 2.940960762742941, "step": 4010}, {"loss": 0.9812, "grad_norm": 0.5232274532318115, "learning_rate": 0.0002, "epoch": 2.9482948294829483, "step": 4020}, {"loss": 0.96, "grad_norm": 0.6395137310028076, "learning_rate": 0.0002, "epoch": 2.9556288962229558, "step": 4030}, {"loss": 0.9813, "grad_norm": 0.589260458946228, "learning_rate": 0.0002, "epoch": 2.962962962962963, "step": 4040}, {"loss": 0.9541, "grad_norm": 0.5699581503868103, "learning_rate": 0.0002, "epoch": 2.9702970297029703, "step": 4050}, {"loss": 0.9585, "grad_norm": 0.528468132019043, "learning_rate": 0.0002, "epoch": 2.9776310964429777, "step": 4060}, {"loss": 0.9164, "grad_norm": 0.4804670512676239, "learning_rate": 0.0002, "epoch": 2.984965163182985, "step": 4070}, {"loss": 0.9771, "grad_norm": 1.1918889284133911, "learning_rate": 0.0002, "epoch": 2.9922992299229922, "step": 4080}, {"loss": 0.9178, "grad_norm": 0.5479103326797485, "learning_rate": 0.0002, "epoch": 2.9996332966629997, "step": 4090}, {"eval_loss": 1.1642853021621704, "eval_runtime": 32.7511, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.649, "epoch": 2.9996332966629997, "step": 4090}, {"loss": 0.7981, "grad_norm": 0.7430027723312378, "learning_rate": 0.0002, "epoch": 3.006967363403007, "step": 4100}, {"loss": 0.7871, "grad_norm": 0.6293647289276123, "learning_rate": 0.0002, "epoch": 3.014301430143014, "step": 4110}, {"loss": 0.78, "grad_norm": 0.6191329956054688, "learning_rate": 0.0002, "epoch": 3.0216354968830217, "step": 4120}, {"loss": 0.7618, "grad_norm": 0.7959313988685608, "learning_rate": 0.0002, "epoch": 3.028969563623029, "step": 4130}, {"loss": 0.8039, "grad_norm": 0.5956351161003113, "learning_rate": 0.0002, "epoch": 3.036303630363036, "step": 4140}, {"loss": 0.7477, "grad_norm": 0.670383632183075, "learning_rate": 0.0002, "epoch": 3.0436376971030437, "step": 4150}, {"loss": 0.7984, "grad_norm": 0.6414518356323242, "learning_rate": 0.0002, "epoch": 3.050971763843051, "step": 4160}, {"loss": 0.7369, "grad_norm": 0.7928852438926697, "learning_rate": 0.0002, "epoch": 3.058305830583058, "step": 4170}, {"loss": 0.7914, "grad_norm": 0.6211121082305908, "learning_rate": 0.0002, "epoch": 3.0656398973230656, "step": 4180}, {"loss": 0.7365, "grad_norm": 0.6237057447433472, "learning_rate": 0.0002, "epoch": 3.072973964063073, "step": 4190}, {"loss": 0.702, "grad_norm": 0.6522233486175537, "learning_rate": 0.0002, "epoch": 3.08030803080308, "step": 4200}, {"loss": 0.7646, "grad_norm": 0.9396848678588867, "learning_rate": 0.0002, "epoch": 3.0876420975430876, "step": 4210}, {"loss": 0.7559, "grad_norm": 0.8003010749816895, "learning_rate": 0.0002, "epoch": 3.094976164283095, "step": 4220}, {"loss": 0.711, "grad_norm": 0.6733810305595398, "learning_rate": 0.0002, "epoch": 3.102310231023102, "step": 4230}, {"loss": 0.696, "grad_norm": 0.6365828514099121, "learning_rate": 0.0002, "epoch": 3.1096442977631096, "step": 4240}, {"loss": 0.8362, "grad_norm": 1.0805548429489136, "learning_rate": 0.0002, "epoch": 3.116978364503117, "step": 4250}, {"loss": 0.7651, "grad_norm": 0.7262141108512878, "learning_rate": 0.0002, "epoch": 3.1243124312431245, "step": 4260}, {"loss": 0.7304, "grad_norm": 0.5500539541244507, "learning_rate": 0.0002, "epoch": 3.1316464979831315, "step": 4270}, {"loss": 0.7721, "grad_norm": 0.793912947177887, "learning_rate": 0.0002, "epoch": 3.138980564723139, "step": 4280}, {"loss": 0.7708, "grad_norm": 1.2540518045425415, "learning_rate": 0.0002, "epoch": 3.1463146314631465, "step": 4290}, {"loss": 0.782, "grad_norm": 0.7020077705383301, "learning_rate": 0.0002, "epoch": 3.1536486982031535, "step": 4300}, {"loss": 0.7253, "grad_norm": 0.5111123323440552, "learning_rate": 0.0002, "epoch": 3.160982764943161, "step": 4310}, {"loss": 0.8159, "grad_norm": 0.7172090411186218, "learning_rate": 0.0002, "epoch": 3.1683168316831685, "step": 4320}, {"loss": 0.6962, "grad_norm": 0.6343168616294861, "learning_rate": 0.0002, "epoch": 3.1756508984231755, "step": 4330}, {"loss": 0.7938, "grad_norm": 0.9563672542572021, "learning_rate": 0.0002, "epoch": 3.182984965163183, "step": 4340}, {"loss": 0.7385, "grad_norm": 1.0225574970245361, "learning_rate": 0.0002, "epoch": 3.1903190319031904, "step": 4350}, {"loss": 0.8652, "grad_norm": 1.1633386611938477, "learning_rate": 0.0002, "epoch": 3.1976530986431975, "step": 4360}, {"loss": 0.7259, "grad_norm": 0.8915148973464966, "learning_rate": 0.0002, "epoch": 3.204987165383205, "step": 4370}, {"loss": 0.8061, "grad_norm": 0.9156812429428101, "learning_rate": 0.0002, "epoch": 3.2123212321232124, "step": 4380}, {"loss": 0.8189, "grad_norm": 0.6363258957862854, "learning_rate": 0.0002, "epoch": 3.21965529886322, "step": 4390}, {"loss": 0.7996, "grad_norm": 0.579099178314209, "learning_rate": 0.0002, "epoch": 3.226989365603227, "step": 4400}, {"loss": 0.8592, "grad_norm": 0.8778146505355835, "learning_rate": 0.0002, "epoch": 3.2343234323432344, "step": 4410}, {"loss": 0.8281, "grad_norm": 0.8356770873069763, "learning_rate": 0.0002, "epoch": 3.241657499083242, "step": 4420}, {"loss": 0.8484, "grad_norm": 0.702032208442688, "learning_rate": 0.0002, "epoch": 3.248991565823249, "step": 4430}, {"loss": 0.7227, "grad_norm": 0.6386539340019226, "learning_rate": 0.0002, "epoch": 3.2563256325632564, "step": 4440}, {"loss": 0.8374, "grad_norm": 0.7008408904075623, "learning_rate": 0.0002, "epoch": 3.263659699303264, "step": 4450}, {"loss": 0.7572, "grad_norm": 0.9556332230567932, "learning_rate": 0.0002, "epoch": 3.270993766043271, "step": 4460}, {"loss": 0.743, "grad_norm": 0.5667835474014282, "learning_rate": 0.0002, "epoch": 3.2783278327832783, "step": 4470}, {"loss": 0.8152, "grad_norm": 0.8239172697067261, "learning_rate": 0.0002, "epoch": 3.285661899523286, "step": 4480}, {"loss": 0.756, "grad_norm": 0.7045050859451294, "learning_rate": 0.0002, "epoch": 3.292995966263293, "step": 4490}, {"loss": 0.7655, "grad_norm": 0.7131434082984924, "learning_rate": 0.0002, "epoch": 3.3003300330033003, "step": 4500}, {"loss": 0.836, "grad_norm": 0.6924910545349121, "learning_rate": 0.0002, "epoch": 3.3076640997433078, "step": 4510}, {"loss": 0.736, "grad_norm": 0.8945356607437134, "learning_rate": 0.0002, "epoch": 3.3149981664833152, "step": 4520}, {"loss": 0.7575, "grad_norm": 0.6546903252601624, "learning_rate": 0.0002, "epoch": 3.3223322332233223, "step": 4530}, {"loss": 0.7893, "grad_norm": 0.8206679224967957, "learning_rate": 0.0002, "epoch": 3.3296662999633297, "step": 4540}, {"loss": 0.7502, "grad_norm": 0.6482203602790833, "learning_rate": 0.0002, "epoch": 3.3370003667033368, "step": 4550}, {"loss": 0.8172, "grad_norm": 0.7558760046958923, "learning_rate": 0.0002, "epoch": 3.3443344334433442, "step": 4560}, {"loss": 0.744, "grad_norm": 0.7794756889343262, "learning_rate": 0.0002, "epoch": 3.3516685001833517, "step": 4570}, {"loss": 0.7385, "grad_norm": 0.7382805943489075, "learning_rate": 0.0002, "epoch": 3.359002566923359, "step": 4580}, {"loss": 0.8511, "grad_norm": 0.5912511944770813, "learning_rate": 0.0002, "epoch": 3.366336633663366, "step": 4590}, {"loss": 0.8272, "grad_norm": 0.7444885969161987, "learning_rate": 0.0002, "epoch": 3.3736707004033737, "step": 4600}, {"loss": 0.7927, "grad_norm": 0.7354922890663147, "learning_rate": 0.0002, "epoch": 3.381004767143381, "step": 4610}, {"loss": 0.7183, "grad_norm": 0.7685934901237488, "learning_rate": 0.0002, "epoch": 3.388338833883388, "step": 4620}, {"loss": 0.7436, "grad_norm": 0.61041259765625, "learning_rate": 0.0002, "epoch": 3.3956729006233957, "step": 4630}, {"loss": 0.7661, "grad_norm": 0.6820451021194458, "learning_rate": 0.0002, "epoch": 3.403006967363403, "step": 4640}, {"loss": 0.8796, "grad_norm": 0.5819534063339233, "learning_rate": 0.0002, "epoch": 3.41034103410341, "step": 4650}, {"loss": 0.7314, "grad_norm": 0.705410897731781, "learning_rate": 0.0002, "epoch": 3.4176751008434176, "step": 4660}, {"loss": 0.7901, "grad_norm": 0.8052892088890076, "learning_rate": 0.0002, "epoch": 3.425009167583425, "step": 4670}, {"loss": 0.7298, "grad_norm": 0.7746483087539673, "learning_rate": 0.0002, "epoch": 3.432343234323432, "step": 4680}, {"loss": 0.7976, "grad_norm": 0.7713689804077148, "learning_rate": 0.0002, "epoch": 3.4396773010634396, "step": 4690}, {"loss": 0.7427, "grad_norm": 0.810371994972229, "learning_rate": 0.0002, "epoch": 3.447011367803447, "step": 4700}, {"loss": 0.7594, "grad_norm": 0.7702969312667847, "learning_rate": 0.0002, "epoch": 3.4543454345434546, "step": 4710}, {"loss": 0.7957, "grad_norm": 0.7069268822669983, "learning_rate": 0.0002, "epoch": 3.4616795012834616, "step": 4720}, {"loss": 0.8199, "grad_norm": 0.7640359401702881, "learning_rate": 0.0002, "epoch": 3.469013568023469, "step": 4730}, {"loss": 0.6875, "grad_norm": 0.8661707639694214, "learning_rate": 0.0002, "epoch": 3.4763476347634765, "step": 4740}, {"loss": 0.8528, "grad_norm": 0.9970282912254333, "learning_rate": 0.0002, "epoch": 3.4836817015034836, "step": 4750}, {"loss": 0.8462, "grad_norm": 0.5824355483055115, "learning_rate": 0.0002, "epoch": 3.491015768243491, "step": 4760}, {"loss": 0.851, "grad_norm": 1.3072649240493774, "learning_rate": 0.0002, "epoch": 3.4983498349834985, "step": 4770}, {"loss": 0.9101, "grad_norm": 0.873978316783905, "learning_rate": 0.0002, "epoch": 3.5056839017235055, "step": 4780}, {"loss": 0.7403, "grad_norm": 0.5526657104492188, "learning_rate": 0.0002, "epoch": 3.513017968463513, "step": 4790}, {"loss": 0.7921, "grad_norm": 0.790894627571106, "learning_rate": 0.0002, "epoch": 3.5203520352035205, "step": 4800}, {"loss": 0.831, "grad_norm": 0.8119630217552185, "learning_rate": 0.0002, "epoch": 3.5276861019435275, "step": 4810}, {"loss": 0.7351, "grad_norm": 0.633212149143219, "learning_rate": 0.0002, "epoch": 3.535020168683535, "step": 4820}, {"loss": 0.8505, "grad_norm": 0.703029990196228, "learning_rate": 0.0002, "epoch": 3.5423542354235424, "step": 4830}, {"loss": 0.7204, "grad_norm": 0.7603771686553955, "learning_rate": 0.0002, "epoch": 3.54968830216355, "step": 4840}, {"loss": 0.8868, "grad_norm": 0.6260480880737305, "learning_rate": 0.0002, "epoch": 3.557022368903557, "step": 4850}, {"loss": 0.8137, "grad_norm": 0.8203664422035217, "learning_rate": 0.0002, "epoch": 3.5643564356435644, "step": 4860}, {"loss": 0.8821, "grad_norm": 0.7793813347816467, "learning_rate": 0.0002, "epoch": 3.5716905023835714, "step": 4870}, {"loss": 0.8164, "grad_norm": 0.7667397260665894, "learning_rate": 0.0002, "epoch": 3.579024569123579, "step": 4880}, {"loss": 0.7597, "grad_norm": 0.8198829889297485, "learning_rate": 0.0002, "epoch": 3.5863586358635864, "step": 4890}, {"loss": 0.7027, "grad_norm": 0.7689233422279358, "learning_rate": 0.0002, "epoch": 3.593692702603594, "step": 4900}, {"loss": 0.804, "grad_norm": 0.7870983481407166, "learning_rate": 0.0002, "epoch": 3.601026769343601, "step": 4910}, {"loss": 0.8269, "grad_norm": 0.8133853077888489, "learning_rate": 0.0002, "epoch": 3.6083608360836084, "step": 4920}, {"loss": 0.8515, "grad_norm": 1.308401346206665, "learning_rate": 0.0002, "epoch": 3.615694902823616, "step": 4930}, {"loss": 0.8494, "grad_norm": 0.7131121754646301, "learning_rate": 0.0002, "epoch": 3.623028969563623, "step": 4940}, {"loss": 0.7235, "grad_norm": 0.6825910210609436, "learning_rate": 0.0002, "epoch": 3.6303630363036303, "step": 4950}, {"loss": 0.7824, "grad_norm": 0.7254678606987, "learning_rate": 0.0002, "epoch": 3.637697103043638, "step": 4960}, {"loss": 0.7983, "grad_norm": 0.8045085072517395, "learning_rate": 0.0002, "epoch": 3.6450311697836453, "step": 4970}, {"loss": 0.8223, "grad_norm": 0.6991777420043945, "learning_rate": 0.0002, "epoch": 3.6523652365236523, "step": 4980}, {"loss": 0.7806, "grad_norm": 0.7804713249206543, "learning_rate": 0.0002, "epoch": 3.6596993032636598, "step": 4990}, {"loss": 0.8402, "grad_norm": 0.8525708317756653, "learning_rate": 0.0002, "epoch": 3.667033370003667, "step": 5000}, {"loss": 0.8496, "grad_norm": 0.7959994673728943, "learning_rate": 0.0002, "epoch": 3.6743674367436743, "step": 5010}, {"loss": 0.8022, "grad_norm": 0.8103628158569336, "learning_rate": 0.0002, "epoch": 3.6817015034836817, "step": 5020}, {"loss": 0.7376, "grad_norm": 0.7517836093902588, "learning_rate": 0.0002, "epoch": 3.689035570223689, "step": 5030}, {"loss": 0.8375, "grad_norm": 0.6878514289855957, "learning_rate": 0.0002, "epoch": 3.6963696369636962, "step": 5040}, {"loss": 0.7998, "grad_norm": 1.2371820211410522, "learning_rate": 0.0002, "epoch": 3.7037037037037037, "step": 5050}, {"loss": 0.6941, "grad_norm": 0.6567103862762451, "learning_rate": 0.0002, "epoch": 3.711037770443711, "step": 5060}, {"loss": 0.8465, "grad_norm": 1.1254922151565552, "learning_rate": 0.0002, "epoch": 3.718371837183718, "step": 5070}, {"loss": 0.8365, "grad_norm": 0.6796132326126099, "learning_rate": 0.0002, "epoch": 3.7257059039237257, "step": 5080}, {"loss": 0.7818, "grad_norm": 0.7285300493240356, "learning_rate": 0.0002, "epoch": 3.733039970663733, "step": 5090}, {"loss": 0.8581, "grad_norm": 0.8931500911712646, "learning_rate": 0.0002, "epoch": 3.7403740374037406, "step": 5100}, {"loss": 0.8181, "grad_norm": 0.6256856918334961, "learning_rate": 0.0002, "epoch": 3.7477081041437477, "step": 5110}, {"loss": 0.743, "grad_norm": 0.79310142993927, "learning_rate": 0.0002, "epoch": 3.755042170883755, "step": 5120}, {"loss": 0.8235, "grad_norm": 0.6594041585922241, "learning_rate": 0.0002, "epoch": 3.762376237623762, "step": 5130}, {"loss": 0.6925, "grad_norm": 0.7029327750205994, "learning_rate": 0.0002, "epoch": 3.7697103043637696, "step": 5140}, {"loss": 0.7457, "grad_norm": 0.5880070328712463, "learning_rate": 0.0002, "epoch": 3.777044371103777, "step": 5150}, {"loss": 0.8716, "grad_norm": 0.7578945159912109, "learning_rate": 0.0002, "epoch": 3.7843784378437846, "step": 5160}, {"loss": 0.8819, "grad_norm": 0.8276378512382507, "learning_rate": 0.0002, "epoch": 3.7917125045837916, "step": 5170}, {"loss": 0.7559, "grad_norm": 0.7627953886985779, "learning_rate": 0.0002, "epoch": 3.799046571323799, "step": 5180}, {"loss": 0.7665, "grad_norm": 0.8169086575508118, "learning_rate": 0.0002, "epoch": 3.806380638063806, "step": 5190}, {"loss": 0.761, "grad_norm": 0.6605030298233032, "learning_rate": 0.0002, "epoch": 3.8137147048038136, "step": 5200}, {"loss": 0.8804, "grad_norm": 0.5837286114692688, "learning_rate": 0.0002, "epoch": 3.821048771543821, "step": 5210}, {"loss": 0.8369, "grad_norm": 1.2422157526016235, "learning_rate": 0.0002, "epoch": 3.8283828382838285, "step": 5220}, {"loss": 0.8431, "grad_norm": 0.6589220762252808, "learning_rate": 0.0002, "epoch": 3.8357169050238356, "step": 5230}, {"loss": 0.7686, "grad_norm": 0.8567556142807007, "learning_rate": 0.0002, "epoch": 3.843050971763843, "step": 5240}, {"loss": 0.8652, "grad_norm": 0.6490627527236938, "learning_rate": 0.0002, "epoch": 3.8503850385038505, "step": 5250}, {"loss": 0.7386, "grad_norm": 0.620232880115509, "learning_rate": 0.0002, "epoch": 3.8577191052438575, "step": 5260}, {"loss": 0.9192, "grad_norm": 0.7685128450393677, "learning_rate": 0.0002, "epoch": 3.865053171983865, "step": 5270}, {"loss": 0.872, "grad_norm": 0.8113296627998352, "learning_rate": 0.0002, "epoch": 3.8723872387238725, "step": 5280}, {"loss": 0.7156, "grad_norm": 0.8092675805091858, "learning_rate": 0.0002, "epoch": 3.87972130546388, "step": 5290}, {"loss": 0.7325, "grad_norm": 0.583570122718811, "learning_rate": 0.0002, "epoch": 3.887055372203887, "step": 5300}, {"loss": 0.9333, "grad_norm": 1.712363600730896, "learning_rate": 0.0002, "epoch": 3.8943894389438944, "step": 5310}, {"loss": 0.7537, "grad_norm": 0.6673534512519836, "learning_rate": 0.0002, "epoch": 3.9017235056839015, "step": 5320}, {"loss": 0.7035, "grad_norm": 1.9770312309265137, "learning_rate": 0.0002, "epoch": 3.909057572423909, "step": 5330}, {"loss": 0.8793, "grad_norm": 0.6430999636650085, "learning_rate": 0.0002, "epoch": 3.9163916391639164, "step": 5340}, {"loss": 0.839, "grad_norm": 1.0159571170806885, "learning_rate": 0.0002, "epoch": 3.923725705903924, "step": 5350}, {"loss": 0.9332, "grad_norm": 0.8607584834098816, "learning_rate": 0.0002, "epoch": 3.931059772643931, "step": 5360}, {"loss": 0.7261, "grad_norm": 0.6967900991439819, "learning_rate": 0.0002, "epoch": 3.9383938393839384, "step": 5370}, {"loss": 0.8456, "grad_norm": 0.7683077454566956, "learning_rate": 0.0002, "epoch": 3.945727906123946, "step": 5380}, {"loss": 0.7682, "grad_norm": 0.6805762648582458, "learning_rate": 0.0002, "epoch": 3.953061972863953, "step": 5390}, {"loss": 0.7746, "grad_norm": 0.7033619284629822, "learning_rate": 0.0002, "epoch": 3.9603960396039604, "step": 5400}, {"loss": 0.8393, "grad_norm": 0.966112494468689, "learning_rate": 0.0002, "epoch": 3.967730106343968, "step": 5410}, {"loss": 0.8316, "grad_norm": 0.8467881083488464, "learning_rate": 0.0002, "epoch": 3.9750641730839753, "step": 5420}, {"loss": 0.8084, "grad_norm": 0.8005317449569702, "learning_rate": 0.0002, "epoch": 3.9823982398239823, "step": 5430}, {"loss": 0.7168, "grad_norm": 1.1615241765975952, "learning_rate": 0.0002, "epoch": 3.98973230656399, "step": 5440}, {"loss": 0.8263, "grad_norm": 0.6121614575386047, "learning_rate": 0.0002, "epoch": 3.997066373303997, "step": 5450}, {"eval_loss": 1.1834222078323364, "eval_runtime": 32.7569, "eval_samples_per_second": 13.158, "eval_steps_per_second": 1.649, "epoch": 4.0, "step": 5454}, {"loss": 0.7267, "grad_norm": 0.6055727005004883, "learning_rate": 0.0002, "epoch": 4.004400440044004, "step": 5460}, {"loss": 0.5766, "grad_norm": 0.8232647180557251, "learning_rate": 0.0002, "epoch": 4.011734506784012, "step": 5470}, {"loss": 0.6489, "grad_norm": 0.7739192247390747, "learning_rate": 0.0002, "epoch": 4.019068573524019, "step": 5480}, {"loss": 0.5978, "grad_norm": 0.6264950633049011, "learning_rate": 0.0002, "epoch": 4.026402640264027, "step": 5490}, {"loss": 0.6392, "grad_norm": 1.4798702001571655, "learning_rate": 0.0002, "epoch": 4.033736707004033, "step": 5500}, {"loss": 0.6143, "grad_norm": 0.9538470506668091, "learning_rate": 0.0002, "epoch": 4.041070773744041, "step": 5510}, {"loss": 0.6056, "grad_norm": 0.834561288356781, "learning_rate": 0.0002, "epoch": 4.048404840484048, "step": 5520}, {"loss": 0.6077, "grad_norm": 0.6407850384712219, "learning_rate": 0.0002, "epoch": 4.055738907224056, "step": 5530}, {"loss": 0.6733, "grad_norm": 0.9035961627960205, "learning_rate": 0.0002, "epoch": 4.063072973964063, "step": 5540}, {"loss": 0.5854, "grad_norm": 0.842812716960907, "learning_rate": 0.0002, "epoch": 4.070407040704071, "step": 5550}, {"loss": 0.654, "grad_norm": 0.8197882175445557, "learning_rate": 0.0002, "epoch": 4.077741107444078, "step": 5560}, {"loss": 0.5919, "grad_norm": 0.8652673959732056, "learning_rate": 0.0002, "epoch": 4.085075174184085, "step": 5570}, {"loss": 0.6188, "grad_norm": 0.8048318028450012, "learning_rate": 0.0002, "epoch": 4.092409240924092, "step": 5580}, {"loss": 0.6487, "grad_norm": 0.9604969024658203, "learning_rate": 0.0002, "epoch": 4.0997433076641, "step": 5590}, {"loss": 0.6356, "grad_norm": 1.244756817817688, "learning_rate": 0.0002, "epoch": 4.107077374404107, "step": 5600}, {"loss": 0.6489, "grad_norm": 0.7975269556045532, "learning_rate": 0.0002, "epoch": 4.114411441144115, "step": 5610}, {"loss": 0.6445, "grad_norm": 0.6130099296569824, "learning_rate": 0.0002, "epoch": 4.121745507884122, "step": 5620}, {"loss": 0.6024, "grad_norm": 0.7793202996253967, "learning_rate": 0.0002, "epoch": 4.129079574624129, "step": 5630}, {"loss": 0.5723, "grad_norm": 1.187238335609436, "learning_rate": 0.0002, "epoch": 4.136413641364136, "step": 5640}, {"loss": 0.6385, "grad_norm": 0.8450375199317932, "learning_rate": 0.0002, "epoch": 4.143747708104144, "step": 5650}, {"loss": 0.6866, "grad_norm": 0.9006940126419067, "learning_rate": 0.0002, "epoch": 4.151081774844151, "step": 5660}, {"loss": 0.6179, "grad_norm": 0.9447154998779297, "learning_rate": 0.0002, "epoch": 4.158415841584159, "step": 5670}, {"loss": 0.6476, "grad_norm": 0.798032283782959, "learning_rate": 0.0002, "epoch": 4.165749908324166, "step": 5680}, {"loss": 0.6666, "grad_norm": 0.65578693151474, "learning_rate": 0.0002, "epoch": 4.1730839750641735, "step": 5690}, {"loss": 0.701, "grad_norm": 1.0864700078964233, "learning_rate": 0.0002, "epoch": 4.18041804180418, "step": 5700}, {"loss": 0.6895, "grad_norm": 0.7344121932983398, "learning_rate": 0.0002, "epoch": 4.187752108544188, "step": 5710}, {"loss": 0.6659, "grad_norm": 0.9722456932067871, "learning_rate": 0.0002, "epoch": 4.195086175284195, "step": 5720}, {"loss": 0.6887, "grad_norm": 1.263814926147461, "learning_rate": 0.0002, "epoch": 4.2024202420242025, "step": 5730}, {"loss": 0.608, "grad_norm": 0.9622581005096436, "learning_rate": 0.0002, "epoch": 4.20975430876421, "step": 5740}, {"loss": 0.6221, "grad_norm": 0.8497143387794495, "learning_rate": 0.0002, "epoch": 4.2170883755042174, "step": 5750}, {"loss": 0.6322, "grad_norm": 0.8248446583747864, "learning_rate": 0.0002, "epoch": 4.224422442244224, "step": 5760}, {"loss": 0.6045, "grad_norm": 1.2544798851013184, "learning_rate": 0.0002, "epoch": 4.2317565089842315, "step": 5770}, {"loss": 0.641, "grad_norm": 0.8224676251411438, "learning_rate": 0.0002, "epoch": 4.239090575724239, "step": 5780}, {"loss": 0.6399, "grad_norm": 0.8924877047538757, "learning_rate": 0.0002, "epoch": 4.2464246424642464, "step": 5790}, {"loss": 0.6845, "grad_norm": 0.8545848727226257, "learning_rate": 0.0002, "epoch": 4.253758709204254, "step": 5800}, {"loss": 0.6669, "grad_norm": 0.8081067800521851, "learning_rate": 0.0002, "epoch": 4.261092775944261, "step": 5810}, {"loss": 0.6149, "grad_norm": 0.7111002802848816, "learning_rate": 0.0002, "epoch": 4.268426842684269, "step": 5820}, {"loss": 0.6343, "grad_norm": 0.8696979880332947, "learning_rate": 0.0002, "epoch": 4.2757609094242754, "step": 5830}, {"loss": 0.6384, "grad_norm": 0.821401834487915, "learning_rate": 0.0002, "epoch": 4.283094976164283, "step": 5840}, {"loss": 0.6912, "grad_norm": 0.888908326625824, "learning_rate": 0.0002, "epoch": 4.29042904290429, "step": 5850}, {"loss": 0.6061, "grad_norm": 1.9380123615264893, "learning_rate": 0.0002, "epoch": 4.297763109644298, "step": 5860}, {"loss": 0.6766, "grad_norm": 1.121774435043335, "learning_rate": 0.0002, "epoch": 4.305097176384305, "step": 5870}, {"loss": 0.7205, "grad_norm": 0.9238282442092896, "learning_rate": 0.0002, "epoch": 4.312431243124313, "step": 5880}, {"loss": 0.6351, "grad_norm": 0.7321620583534241, "learning_rate": 0.0002, "epoch": 4.319765309864319, "step": 5890}, {"loss": 0.6404, "grad_norm": 0.8739548325538635, "learning_rate": 0.0002, "epoch": 4.327099376604327, "step": 5900}, {"loss": 0.5892, "grad_norm": 0.9686012268066406, "learning_rate": 0.0002, "epoch": 4.334433443344334, "step": 5910}, {"loss": 0.641, "grad_norm": 0.9033839106559753, "learning_rate": 0.0002, "epoch": 4.341767510084342, "step": 5920}, {"loss": 0.6456, "grad_norm": 0.8131115436553955, "learning_rate": 0.0002, "epoch": 4.349101576824349, "step": 5930}, {"loss": 0.5826, "grad_norm": 0.8942412734031677, "learning_rate": 0.0002, "epoch": 4.356435643564357, "step": 5940}, {"loss": 0.7336, "grad_norm": 0.8439112901687622, "learning_rate": 0.0002, "epoch": 4.363769710304364, "step": 5950}, {"loss": 0.6537, "grad_norm": 0.9176713228225708, "learning_rate": 0.0002, "epoch": 4.371103777044371, "step": 5960}, {"loss": 0.6792, "grad_norm": 0.6799634695053101, "learning_rate": 0.0002, "epoch": 4.378437843784378, "step": 5970}, {"loss": 0.7266, "grad_norm": 1.0435824394226074, "learning_rate": 0.0002, "epoch": 4.385771910524386, "step": 5980}, {"loss": 0.68, "grad_norm": 0.997937798500061, "learning_rate": 0.0002, "epoch": 4.393105977264393, "step": 5990}, {"loss": 0.6604, "grad_norm": 1.0308842658996582, "learning_rate": 0.0002, "epoch": 4.400440044004401, "step": 6000}, {"loss": 0.6402, "grad_norm": 1.3683775663375854, "learning_rate": 0.0002, "epoch": 4.407774110744408, "step": 6010}, {"loss": 0.7027, "grad_norm": 0.7569534182548523, "learning_rate": 0.0002, "epoch": 4.415108177484415, "step": 6020}, {"loss": 0.5949, "grad_norm": 1.089978575706482, "learning_rate": 0.0002, "epoch": 4.422442244224422, "step": 6030}, {"loss": 0.6353, "grad_norm": 0.7522459626197815, "learning_rate": 0.0002, "epoch": 4.42977631096443, "step": 6040}, {"loss": 0.5852, "grad_norm": 0.6709823608398438, "learning_rate": 0.0002, "epoch": 4.437110377704437, "step": 6050}, {"loss": 0.6718, "grad_norm": 0.6992089748382568, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6060}, {"loss": 0.6933, "grad_norm": 1.0182931423187256, "learning_rate": 0.0002, "epoch": 4.451778511184452, "step": 6070}, {"loss": 0.6255, "grad_norm": 1.0685160160064697, "learning_rate": 0.0002, "epoch": 4.459112577924459, "step": 6080}, {"loss": 0.6086, "grad_norm": 0.8295124769210815, "learning_rate": 0.0002, "epoch": 4.466446644664466, "step": 6090}, {"loss": 0.6359, "grad_norm": 1.1862998008728027, "learning_rate": 0.0002, "epoch": 4.473780711404474, "step": 6100}, {"loss": 0.638, "grad_norm": 0.7400273084640503, "learning_rate": 0.0002, "epoch": 4.481114778144481, "step": 6110}, {"loss": 0.6854, "grad_norm": 0.7098417282104492, "learning_rate": 0.0002, "epoch": 4.488448844884489, "step": 6120}, {"loss": 0.6976, "grad_norm": 0.9745053648948669, "learning_rate": 0.0002, "epoch": 4.495782911624496, "step": 6130}, {"loss": 0.605, "grad_norm": 0.8638797998428345, "learning_rate": 0.0002, "epoch": 4.503116978364503, "step": 6140}, {"loss": 0.6491, "grad_norm": 0.8291046619415283, "learning_rate": 0.0002, "epoch": 4.51045104510451, "step": 6150}, {"loss": 0.6457, "grad_norm": 1.0301737785339355, "learning_rate": 0.0002, "epoch": 4.517785111844518, "step": 6160}, {"loss": 0.6742, "grad_norm": 1.1996512413024902, "learning_rate": 0.0002, "epoch": 4.525119178584525, "step": 6170}, {"loss": 0.6484, "grad_norm": 1.151038408279419, "learning_rate": 0.0002, "epoch": 4.5324532453245325, "step": 6180}, {"loss": 0.668, "grad_norm": 0.8385201096534729, "learning_rate": 0.0002, "epoch": 4.53978731206454, "step": 6190}, {"loss": 0.6381, "grad_norm": 0.8969188332557678, "learning_rate": 0.0002, "epoch": 4.5471213788045475, "step": 6200}, {"loss": 0.7141, "grad_norm": 1.60659658908844, "learning_rate": 0.0002, "epoch": 4.554455445544555, "step": 6210}, {"loss": 0.6388, "grad_norm": 0.9356731176376343, "learning_rate": 0.0002, "epoch": 4.5617895122845615, "step": 6220}, {"loss": 0.7393, "grad_norm": 0.95856773853302, "learning_rate": 0.0002, "epoch": 4.569123579024569, "step": 6230}, {"loss": 0.6554, "grad_norm": 1.1162524223327637, "learning_rate": 0.0002, "epoch": 4.5764576457645765, "step": 6240}, {"loss": 0.6012, "grad_norm": 0.8809238076210022, "learning_rate": 0.0002, "epoch": 4.583791712504584, "step": 6250}, {"loss": 0.648, "grad_norm": 0.890738844871521, "learning_rate": 0.0002, "epoch": 4.591125779244591, "step": 6260}, {"loss": 0.6663, "grad_norm": 0.918684720993042, "learning_rate": 0.0002, "epoch": 4.598459845984598, "step": 6270}, {"loss": 0.5992, "grad_norm": 0.8156296610832214, "learning_rate": 0.0002, "epoch": 4.6057939127246055, "step": 6280}, {"loss": 0.723, "grad_norm": 1.046634316444397, "learning_rate": 0.0002, "epoch": 4.613127979464613, "step": 6290}, {"loss": 0.7023, "grad_norm": 0.7725525498390198, "learning_rate": 0.0002, "epoch": 4.62046204620462, "step": 6300}, {"loss": 0.6414, "grad_norm": 0.9992046356201172, "learning_rate": 0.0002, "epoch": 4.627796112944628, "step": 6310}, {"loss": 0.6201, "grad_norm": 0.8480095267295837, "learning_rate": 0.0002, "epoch": 4.635130179684635, "step": 6320}, {"loss": 0.6869, "grad_norm": 0.7061955332756042, "learning_rate": 0.0002, "epoch": 4.642464246424643, "step": 6330}, {"loss": 0.6828, "grad_norm": 1.0354212522506714, "learning_rate": 0.0002, "epoch": 4.649798313164649, "step": 6340}, {"loss": 0.6651, "grad_norm": 1.0081377029418945, "learning_rate": 0.0002, "epoch": 4.657132379904657, "step": 6350}, {"loss": 0.726, "grad_norm": 1.2904249429702759, "learning_rate": 0.0002, "epoch": 4.664466446644664, "step": 6360}, {"loss": 0.7148, "grad_norm": 0.9248910546302795, "learning_rate": 0.0002, "epoch": 4.671800513384672, "step": 6370}, {"loss": 0.6961, "grad_norm": 0.9907804131507874, "learning_rate": 0.0002, "epoch": 4.679134580124679, "step": 6380}, {"loss": 0.6163, "grad_norm": 1.201143741607666, "learning_rate": 0.0002, "epoch": 4.686468646864687, "step": 6390}, {"loss": 0.6762, "grad_norm": 0.8709394335746765, "learning_rate": 0.0002, "epoch": 4.693802713604693, "step": 6400}, {"loss": 0.7217, "grad_norm": 0.7468608021736145, "learning_rate": 0.0002, "epoch": 4.701136780344701, "step": 6410}, {"loss": 0.6548, "grad_norm": 0.8607903718948364, "learning_rate": 0.0002, "epoch": 4.708470847084708, "step": 6420}, {"loss": 0.6449, "grad_norm": 0.9840512871742249, "learning_rate": 0.0002, "epoch": 4.715804913824716, "step": 6430}, {"loss": 0.685, "grad_norm": 0.8328204154968262, "learning_rate": 0.0002, "epoch": 4.723138980564723, "step": 6440}, {"loss": 0.697, "grad_norm": 0.924505352973938, "learning_rate": 0.0002, "epoch": 4.730473047304731, "step": 6450}, {"loss": 0.7422, "grad_norm": 0.8897685408592224, "learning_rate": 0.0002, "epoch": 4.737807114044738, "step": 6460}, {"loss": 0.6842, "grad_norm": 0.9605024456977844, "learning_rate": 0.0002, "epoch": 4.745141180784745, "step": 6470}, {"loss": 0.6488, "grad_norm": 0.8150759935379028, "learning_rate": 0.0002, "epoch": 4.752475247524752, "step": 6480}, {"loss": 0.6606, "grad_norm": 0.8128412961959839, "learning_rate": 0.0002, "epoch": 4.75980931426476, "step": 6490}, {"loss": 0.6729, "grad_norm": 0.7381404638290405, "learning_rate": 0.0002, "epoch": 4.767143381004767, "step": 6500}, {"loss": 0.6713, "grad_norm": 1.0565853118896484, "learning_rate": 0.0002, "epoch": 4.774477447744775, "step": 6510}, {"loss": 0.6496, "grad_norm": 0.9298134446144104, "learning_rate": 0.0002, "epoch": 4.781811514484782, "step": 6520}, {"loss": 0.7279, "grad_norm": 1.0145525932312012, "learning_rate": 0.0002, "epoch": 4.789145581224789, "step": 6530}, {"loss": 0.5986, "grad_norm": 0.92259681224823, "learning_rate": 0.0002, "epoch": 4.796479647964796, "step": 6540}, {"loss": 0.63, "grad_norm": 0.7881024479866028, "learning_rate": 0.0002, "epoch": 4.803813714704804, "step": 6550}, {"loss": 0.7134, "grad_norm": 1.4935206174850464, "learning_rate": 0.0002, "epoch": 4.811147781444811, "step": 6560}, {"loss": 0.6695, "grad_norm": 0.8612369298934937, "learning_rate": 0.0002, "epoch": 4.818481848184819, "step": 6570}, {"loss": 0.779, "grad_norm": 1.0118653774261475, "learning_rate": 0.0002, "epoch": 4.825815914924826, "step": 6580}, {"loss": 0.6991, "grad_norm": 1.1303809881210327, "learning_rate": 0.0002, "epoch": 4.833149981664834, "step": 6590}, {"loss": 0.7887, "grad_norm": 0.9112492203712463, "learning_rate": 0.0002, "epoch": 4.84048404840484, "step": 6600}, {"loss": 0.7699, "grad_norm": 0.864762544631958, "learning_rate": 0.0002, "epoch": 4.847818115144848, "step": 6610}, {"loss": 0.7347, "grad_norm": 0.9090572595596313, "learning_rate": 0.0002, "epoch": 4.855152181884855, "step": 6620}, {"loss": 0.6608, "grad_norm": 1.014953374862671, "learning_rate": 0.0002, "epoch": 4.862486248624863, "step": 6630}, {"loss": 0.6429, "grad_norm": 1.0702149868011475, "learning_rate": 0.0002, "epoch": 4.86982031536487, "step": 6640}, {"loss": 0.6943, "grad_norm": 1.002135157585144, "learning_rate": 0.0002, "epoch": 4.8771543821048775, "step": 6650}, {"loss": 0.7225, "grad_norm": 0.862545907497406, "learning_rate": 0.0002, "epoch": 4.884488448844884, "step": 6660}, {"loss": 0.6206, "grad_norm": 0.7302131056785583, "learning_rate": 0.0002, "epoch": 4.891822515584892, "step": 6670}, {"loss": 0.7175, "grad_norm": 0.8380730152130127, "learning_rate": 0.0002, "epoch": 4.899156582324899, "step": 6680}, {"loss": 0.645, "grad_norm": 0.7956018447875977, "learning_rate": 0.0002, "epoch": 4.9064906490649065, "step": 6690}, {"loss": 0.6431, "grad_norm": 0.6717583537101746, "learning_rate": 0.0002, "epoch": 4.913824715804914, "step": 6700}, {"loss": 0.6942, "grad_norm": 1.09099280834198, "learning_rate": 0.0002, "epoch": 4.9211587825449215, "step": 6710}, {"loss": 0.7533, "grad_norm": 0.8589889407157898, "learning_rate": 0.0002, "epoch": 4.928492849284929, "step": 6720}, {"loss": 0.66, "grad_norm": 1.0046314001083374, "learning_rate": 0.0002, "epoch": 4.9358269160249355, "step": 6730}, {"loss": 0.6864, "grad_norm": 0.8559659123420715, "learning_rate": 0.0002, "epoch": 4.943160982764943, "step": 6740}, {"loss": 0.6847, "grad_norm": 0.8588525652885437, "learning_rate": 0.0002, "epoch": 4.9504950495049505, "step": 6750}, {"loss": 0.6428, "grad_norm": 0.9192708134651184, "learning_rate": 0.0002, "epoch": 4.957829116244958, "step": 6760}, {"loss": 0.6873, "grad_norm": 1.051398754119873, "learning_rate": 0.0002, "epoch": 4.965163182984965, "step": 6770}, {"loss": 0.7249, "grad_norm": 0.9111362099647522, "learning_rate": 0.0002, "epoch": 4.972497249724973, "step": 6780}, {"loss": 0.7613, "grad_norm": 0.7305638194084167, "learning_rate": 0.0002, "epoch": 4.9798313164649795, "step": 6790}, {"loss": 0.6747, "grad_norm": 1.118837594985962, "learning_rate": 0.0002, "epoch": 4.987165383204987, "step": 6800}, {"loss": 0.6412, "grad_norm": 0.9075239300727844, "learning_rate": 0.0002, "epoch": 4.994499449944994, "step": 6810}]} +{"epoch": 6.0, "step": 8181, "epoch_duration": 1471.2244851589203, "total_accumulated_duration": 8848.251735210419, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}, {"eval_loss": 1.1534006595611572, "eval_runtime": 32.7541, "eval_samples_per_second": 13.159, "eval_steps_per_second": 1.649, "epoch": 2.0, "step": 2727}, {"loss": 0.9624, "grad_norm": 0.48628315329551697, "learning_rate": 0.0002, "epoch": 2.002200220022002, "step": 2730}, {"loss": 0.9603, "grad_norm": 0.413875013589859, "learning_rate": 0.0002, "epoch": 2.0095342867620096, "step": 2740}, {"loss": 0.965, "grad_norm": 0.4988735616207123, "learning_rate": 0.0002, "epoch": 2.0168683535020167, "step": 2750}, {"loss": 0.9677, "grad_norm": 0.5634812712669373, "learning_rate": 0.0002, "epoch": 2.024202420242024, "step": 2760}, {"loss": 0.9547, "grad_norm": 0.48302653431892395, "learning_rate": 0.0002, "epoch": 2.0315364869820316, "step": 2770}, {"loss": 0.9346, "grad_norm": 0.49914175271987915, "learning_rate": 0.0002, "epoch": 2.038870553722039, "step": 2780}, {"loss": 0.904, "grad_norm": 1.14039945602417, "learning_rate": 0.0002, "epoch": 2.046204620462046, "step": 2790}, {"loss": 0.9588, "grad_norm": 0.6359720826148987, "learning_rate": 0.0002, "epoch": 2.0535386872020536, "step": 2800}, {"loss": 0.9031, "grad_norm": 0.4589158296585083, "learning_rate": 0.0002, "epoch": 2.060872753942061, "step": 2810}, {"loss": 0.9438, "grad_norm": 0.46255481243133545, "learning_rate": 0.0002, "epoch": 2.068206820682068, "step": 2820}, {"loss": 0.9464, "grad_norm": 0.6232137680053711, "learning_rate": 0.0002, "epoch": 2.0755408874220755, "step": 2830}, {"loss": 0.8978, "grad_norm": 0.41042178869247437, "learning_rate": 0.0002, "epoch": 2.082874954162083, "step": 2840}, {"loss": 0.8516, "grad_norm": 0.5334428548812866, "learning_rate": 0.0002, "epoch": 2.09020902090209, "step": 2850}, {"loss": 0.9313, "grad_norm": 0.8270058631896973, "learning_rate": 0.0002, "epoch": 2.0975430876420975, "step": 2860}, {"loss": 1.0064, "grad_norm": 0.6624533534049988, "learning_rate": 0.0002, "epoch": 2.104877154382105, "step": 2870}, {"loss": 0.9196, "grad_norm": 0.5448863506317139, "learning_rate": 0.0002, "epoch": 2.112211221122112, "step": 2880}, {"loss": 0.887, "grad_norm": 0.621482789516449, "learning_rate": 0.0002, "epoch": 2.1195452878621195, "step": 2890}, {"loss": 0.9702, "grad_norm": 0.4556255340576172, "learning_rate": 0.0002, "epoch": 2.126879354602127, "step": 2900}, {"loss": 0.9323, "grad_norm": 0.4620579183101654, "learning_rate": 0.0002, "epoch": 2.1342134213421344, "step": 2910}, {"loss": 0.836, "grad_norm": 0.9602415561676025, "learning_rate": 0.0002, "epoch": 2.1415474880821415, "step": 2920}, {"loss": 0.8826, "grad_norm": 0.587943971157074, "learning_rate": 0.0002, "epoch": 2.148881554822149, "step": 2930}, {"loss": 0.971, "grad_norm": 0.5121372938156128, "learning_rate": 0.0002, "epoch": 2.1562156215621564, "step": 2940}, {"loss": 0.8751, "grad_norm": 0.49424484372138977, "learning_rate": 0.0002, "epoch": 2.1635496883021634, "step": 2950}, {"loss": 0.8674, "grad_norm": 0.6312560439109802, "learning_rate": 0.0002, "epoch": 2.170883755042171, "step": 2960}, {"loss": 0.9791, "grad_norm": 0.5235576629638672, "learning_rate": 0.0002, "epoch": 2.1782178217821784, "step": 2970}, {"loss": 0.9706, "grad_norm": 0.5868439674377441, "learning_rate": 0.0002, "epoch": 2.1855518885221854, "step": 2980}, {"loss": 0.9338, "grad_norm": 0.42302873730659485, "learning_rate": 0.0002, "epoch": 2.192885955262193, "step": 2990}, {"loss": 0.9332, "grad_norm": 0.5097725987434387, "learning_rate": 0.0002, "epoch": 2.2002200220022003, "step": 3000}, {"loss": 0.9239, "grad_norm": 0.5091572403907776, "learning_rate": 0.0002, "epoch": 2.2075540887422074, "step": 3010}, {"loss": 0.8898, "grad_norm": 0.49433162808418274, "learning_rate": 0.0002, "epoch": 2.214888155482215, "step": 3020}, {"loss": 0.9734, "grad_norm": 0.5577368140220642, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3030}, {"loss": 0.9033, "grad_norm": 0.6177583932876587, "learning_rate": 0.0002, "epoch": 2.2295562889622293, "step": 3040}, {"loss": 0.9882, "grad_norm": 0.5256719589233398, "learning_rate": 0.0002, "epoch": 2.236890355702237, "step": 3050}, {"loss": 0.9439, "grad_norm": 0.5001118183135986, "learning_rate": 0.0002, "epoch": 2.2442244224422443, "step": 3060}, {"loss": 0.8718, "grad_norm": 0.5721249580383301, "learning_rate": 0.0002, "epoch": 2.2515584891822513, "step": 3070}, {"loss": 1.0648, "grad_norm": 0.5325384140014648, "learning_rate": 0.0002, "epoch": 2.258892555922259, "step": 3080}, {"loss": 0.9843, "grad_norm": 0.5719189047813416, "learning_rate": 0.0002, "epoch": 2.2662266226622663, "step": 3090}, {"loss": 0.8633, "grad_norm": 0.6337835788726807, "learning_rate": 0.0002, "epoch": 2.2735606894022737, "step": 3100}, {"loss": 0.9962, "grad_norm": 0.5381836891174316, "learning_rate": 0.0002, "epoch": 2.2808947561422808, "step": 3110}, {"loss": 0.8265, "grad_norm": 0.5408531427383423, "learning_rate": 0.0002, "epoch": 2.2882288228822882, "step": 3120}, {"loss": 1.0325, "grad_norm": 0.43705281615257263, "learning_rate": 0.0002, "epoch": 2.2955628896222957, "step": 3130}, {"loss": 0.9388, "grad_norm": 0.6454030275344849, "learning_rate": 0.0002, "epoch": 2.3028969563623027, "step": 3140}, {"loss": 0.954, "grad_norm": 0.686030387878418, "learning_rate": 0.0002, "epoch": 2.31023102310231, "step": 3150}, {"loss": 0.9403, "grad_norm": 0.5123633146286011, "learning_rate": 0.0002, "epoch": 2.3175650898423177, "step": 3160}, {"loss": 0.8834, "grad_norm": 0.842506468296051, "learning_rate": 0.0002, "epoch": 2.3248991565823247, "step": 3170}, {"loss": 1.0497, "grad_norm": 0.5193818807601929, "learning_rate": 0.0002, "epoch": 2.332233223322332, "step": 3180}, {"loss": 0.9473, "grad_norm": 0.5634409189224243, "learning_rate": 0.0002, "epoch": 2.3395672900623397, "step": 3190}, {"loss": 0.8499, "grad_norm": 0.6475534439086914, "learning_rate": 0.0002, "epoch": 2.3469013568023467, "step": 3200}, {"loss": 0.874, "grad_norm": 1.1503914594650269, "learning_rate": 0.0002, "epoch": 2.354235423542354, "step": 3210}, {"loss": 0.9762, "grad_norm": 0.7234905362129211, "learning_rate": 0.0002, "epoch": 2.3615694902823616, "step": 3220}, {"loss": 0.9007, "grad_norm": 0.664903461933136, "learning_rate": 0.0002, "epoch": 2.368903557022369, "step": 3230}, {"loss": 0.9987, "grad_norm": 0.5453006625175476, "learning_rate": 0.0002, "epoch": 2.376237623762376, "step": 3240}, {"loss": 0.9742, "grad_norm": 0.6256654262542725, "learning_rate": 0.0002, "epoch": 2.3835716905023836, "step": 3250}, {"loss": 0.9922, "grad_norm": 0.5166565179824829, "learning_rate": 0.0002, "epoch": 2.390905757242391, "step": 3260}, {"loss": 0.927, "grad_norm": 0.5699098110198975, "learning_rate": 0.0002, "epoch": 2.398239823982398, "step": 3270}, {"loss": 0.8878, "grad_norm": 0.4472540020942688, "learning_rate": 0.0002, "epoch": 2.4055738907224056, "step": 3280}, {"loss": 0.9439, "grad_norm": 0.6790403127670288, "learning_rate": 0.0002, "epoch": 2.412907957462413, "step": 3290}, {"loss": 0.972, "grad_norm": 0.5182185173034668, "learning_rate": 0.0002, "epoch": 2.42024202420242, "step": 3300}, {"loss": 0.9775, "grad_norm": 0.564647912979126, "learning_rate": 0.0002, "epoch": 2.4275760909424275, "step": 3310}, {"loss": 1.072, "grad_norm": 0.5625313520431519, "learning_rate": 0.0002, "epoch": 2.434910157682435, "step": 3320}, {"loss": 0.8798, "grad_norm": 0.7496559619903564, "learning_rate": 0.0002, "epoch": 2.442244224422442, "step": 3330}, {"loss": 0.868, "grad_norm": 0.4779128134250641, "learning_rate": 0.0002, "epoch": 2.4495782911624495, "step": 3340}, {"loss": 1.0316, "grad_norm": 0.578093409538269, "learning_rate": 0.0002, "epoch": 2.456912357902457, "step": 3350}, {"loss": 0.9282, "grad_norm": 0.5456080436706543, "learning_rate": 0.0002, "epoch": 2.4642464246424645, "step": 3360}, {"loss": 0.8409, "grad_norm": 0.4769273102283478, "learning_rate": 0.0002, "epoch": 2.4715804913824715, "step": 3370}, {"loss": 0.9312, "grad_norm": 0.5608189702033997, "learning_rate": 0.0002, "epoch": 2.478914558122479, "step": 3380}, {"loss": 0.9934, "grad_norm": 0.5590165853500366, "learning_rate": 0.0002, "epoch": 2.4862486248624864, "step": 3390}, {"loss": 1.025, "grad_norm": 0.801306962966919, "learning_rate": 0.0002, "epoch": 2.4935826916024935, "step": 3400}, {"loss": 0.9049, "grad_norm": 0.6045624613761902, "learning_rate": 0.0002, "epoch": 2.500916758342501, "step": 3410}, {"loss": 0.944, "grad_norm": 0.5735858082771301, "learning_rate": 0.0002, "epoch": 2.5082508250825084, "step": 3420}, {"loss": 0.9846, "grad_norm": 0.6827309131622314, "learning_rate": 0.0002, "epoch": 2.5155848918225154, "step": 3430}, {"loss": 0.9789, "grad_norm": 0.5702602863311768, "learning_rate": 0.0002, "epoch": 2.522918958562523, "step": 3440}, {"loss": 0.9127, "grad_norm": 0.6674721240997314, "learning_rate": 0.0002, "epoch": 2.5302530253025304, "step": 3450}, {"loss": 0.914, "grad_norm": 0.5635907649993896, "learning_rate": 0.0002, "epoch": 2.5375870920425374, "step": 3460}, {"loss": 0.8398, "grad_norm": 0.42737770080566406, "learning_rate": 0.0002, "epoch": 2.544921158782545, "step": 3470}, {"loss": 0.9474, "grad_norm": 0.6720691919326782, "learning_rate": 0.0002, "epoch": 2.5522552255225524, "step": 3480}, {"loss": 0.8637, "grad_norm": 0.8917084336280823, "learning_rate": 0.0002, "epoch": 2.55958929226256, "step": 3490}, {"loss": 0.9257, "grad_norm": 0.5134549140930176, "learning_rate": 0.0002, "epoch": 2.566923359002567, "step": 3500}, {"loss": 0.9362, "grad_norm": 0.4951367974281311, "learning_rate": 0.0002, "epoch": 2.5742574257425743, "step": 3510}, {"loss": 0.9184, "grad_norm": 0.9438204765319824, "learning_rate": 0.0002, "epoch": 2.5815914924825814, "step": 3520}, {"loss": 0.8939, "grad_norm": 0.6024714708328247, "learning_rate": 0.0002, "epoch": 2.588925559222589, "step": 3530}, {"loss": 0.9298, "grad_norm": 0.5248535871505737, "learning_rate": 0.0002, "epoch": 2.5962596259625963, "step": 3540}, {"loss": 0.941, "grad_norm": 0.8677568435668945, "learning_rate": 0.0002, "epoch": 2.6035936927026038, "step": 3550}, {"loss": 0.9253, "grad_norm": 0.82008296251297, "learning_rate": 0.0002, "epoch": 2.610927759442611, "step": 3560}, {"loss": 0.8429, "grad_norm": 0.4724634885787964, "learning_rate": 0.0002, "epoch": 2.6182618261826183, "step": 3570}, {"loss": 0.9058, "grad_norm": 0.5434244275093079, "learning_rate": 0.0002, "epoch": 2.6255958929226257, "step": 3580}, {"loss": 0.9379, "grad_norm": 0.4948740005493164, "learning_rate": 0.0002, "epoch": 2.6329299596626328, "step": 3590}, {"loss": 0.8718, "grad_norm": 0.42109328508377075, "learning_rate": 0.0002, "epoch": 2.6402640264026402, "step": 3600}, {"loss": 0.9809, "grad_norm": 0.7979786396026611, "learning_rate": 0.0002, "epoch": 2.6475980931426477, "step": 3610}, {"loss": 0.9229, "grad_norm": 0.6345919370651245, "learning_rate": 0.0002, "epoch": 2.654932159882655, "step": 3620}, {"loss": 0.8506, "grad_norm": 0.4971671402454376, "learning_rate": 0.0002, "epoch": 2.662266226622662, "step": 3630}, {"loss": 0.8054, "grad_norm": 0.6467748284339905, "learning_rate": 0.0002, "epoch": 2.6696002933626697, "step": 3640}, {"loss": 0.9277, "grad_norm": 0.4240160286426544, "learning_rate": 0.0002, "epoch": 2.6769343601026767, "step": 3650}, {"loss": 0.8213, "grad_norm": 0.5179754495620728, "learning_rate": 0.0002, "epoch": 2.684268426842684, "step": 3660}, {"loss": 0.9221, "grad_norm": 0.754012405872345, "learning_rate": 0.0002, "epoch": 2.6916024935826917, "step": 3670}, {"loss": 0.9194, "grad_norm": 0.5141299962997437, "learning_rate": 0.0002, "epoch": 2.698936560322699, "step": 3680}, {"loss": 0.9495, "grad_norm": 0.5737819075584412, "learning_rate": 0.0002, "epoch": 2.706270627062706, "step": 3690}, {"loss": 1.0162, "grad_norm": 0.5887577533721924, "learning_rate": 0.0002, "epoch": 2.7136046938027136, "step": 3700}, {"loss": 0.9169, "grad_norm": 0.6740471720695496, "learning_rate": 0.0002, "epoch": 2.720938760542721, "step": 3710}, {"loss": 0.9297, "grad_norm": 0.5879453420639038, "learning_rate": 0.0002, "epoch": 2.728272827282728, "step": 3720}, {"loss": 0.9358, "grad_norm": 0.4858354926109314, "learning_rate": 0.0002, "epoch": 2.7356068940227356, "step": 3730}, {"loss": 0.9308, "grad_norm": 0.5489001870155334, "learning_rate": 0.0002, "epoch": 2.742940960762743, "step": 3740}, {"loss": 0.894, "grad_norm": 0.8187092542648315, "learning_rate": 0.0002, "epoch": 2.7502750275027505, "step": 3750}, {"loss": 0.8954, "grad_norm": 0.5666626691818237, "learning_rate": 0.0002, "epoch": 2.7576090942427576, "step": 3760}, {"loss": 1.0059, "grad_norm": 0.5377066135406494, "learning_rate": 0.0002, "epoch": 2.764943160982765, "step": 3770}, {"loss": 0.9132, "grad_norm": 0.566330075263977, "learning_rate": 0.0002, "epoch": 2.772277227722772, "step": 3780}, {"loss": 0.9415, "grad_norm": 0.5522832870483398, "learning_rate": 0.0002, "epoch": 2.7796112944627795, "step": 3790}, {"loss": 0.8816, "grad_norm": 0.5668695569038391, "learning_rate": 0.0002, "epoch": 2.786945361202787, "step": 3800}, {"loss": 0.8885, "grad_norm": 0.7566602826118469, "learning_rate": 0.0002, "epoch": 2.7942794279427945, "step": 3810}, {"loss": 0.8598, "grad_norm": 0.5603684782981873, "learning_rate": 0.0002, "epoch": 2.8016134946828015, "step": 3820}, {"loss": 0.9602, "grad_norm": 0.49122217297554016, "learning_rate": 0.0002, "epoch": 2.808947561422809, "step": 3830}, {"loss": 0.9738, "grad_norm": 0.6798251867294312, "learning_rate": 0.0002, "epoch": 2.816281628162816, "step": 3840}, {"loss": 0.9533, "grad_norm": 0.6097991466522217, "learning_rate": 0.0002, "epoch": 2.8236156949028235, "step": 3850}, {"loss": 0.8672, "grad_norm": 0.6675726175308228, "learning_rate": 0.0002, "epoch": 2.830949761642831, "step": 3860}, {"loss": 0.9324, "grad_norm": 0.9223952889442444, "learning_rate": 0.0002, "epoch": 2.8382838283828384, "step": 3870}, {"loss": 0.8767, "grad_norm": 0.6020799875259399, "learning_rate": 0.0002, "epoch": 2.8456178951228455, "step": 3880}, {"loss": 0.9148, "grad_norm": 0.5206381678581238, "learning_rate": 0.0002, "epoch": 2.852951961862853, "step": 3890}, {"loss": 0.9479, "grad_norm": 0.6268777251243591, "learning_rate": 0.0002, "epoch": 2.8602860286028604, "step": 3900}, {"loss": 0.9409, "grad_norm": 1.1583497524261475, "learning_rate": 0.0002, "epoch": 2.8676200953428674, "step": 3910}, {"loss": 0.895, "grad_norm": 0.7263903021812439, "learning_rate": 0.0002, "epoch": 2.874954162082875, "step": 3920}, {"loss": 0.8786, "grad_norm": 0.5369910001754761, "learning_rate": 0.0002, "epoch": 2.8822882288228824, "step": 3930}, {"loss": 1.0015, "grad_norm": 0.7298350930213928, "learning_rate": 0.0002, "epoch": 2.88962229556289, "step": 3940}, {"loss": 0.979, "grad_norm": 0.577012836933136, "learning_rate": 0.0002, "epoch": 2.896956362302897, "step": 3950}, {"loss": 0.9716, "grad_norm": 0.5859594345092773, "learning_rate": 0.0002, "epoch": 2.9042904290429044, "step": 3960}, {"loss": 0.8772, "grad_norm": 0.47176122665405273, "learning_rate": 0.0002, "epoch": 2.9116244957829114, "step": 3970}, {"loss": 0.8997, "grad_norm": 0.9699620604515076, "learning_rate": 0.0002, "epoch": 2.918958562522919, "step": 3980}, {"loss": 0.9057, "grad_norm": 0.7908747792243958, "learning_rate": 0.0002, "epoch": 2.9262926292629263, "step": 3990}, {"loss": 0.9462, "grad_norm": 0.5777379274368286, "learning_rate": 0.0002, "epoch": 2.933626696002934, "step": 4000}, {"loss": 0.9358, "grad_norm": 0.599288284778595, "learning_rate": 0.0002, "epoch": 2.940960762742941, "step": 4010}, {"loss": 0.9812, "grad_norm": 0.5232274532318115, "learning_rate": 0.0002, "epoch": 2.9482948294829483, "step": 4020}, {"loss": 0.96, "grad_norm": 0.6395137310028076, "learning_rate": 0.0002, "epoch": 2.9556288962229558, "step": 4030}, {"loss": 0.9813, "grad_norm": 0.589260458946228, "learning_rate": 0.0002, "epoch": 2.962962962962963, "step": 4040}, {"loss": 0.9541, "grad_norm": 0.5699581503868103, "learning_rate": 0.0002, "epoch": 2.9702970297029703, "step": 4050}, {"loss": 0.9585, "grad_norm": 0.528468132019043, "learning_rate": 0.0002, "epoch": 2.9776310964429777, "step": 4060}, {"loss": 0.9164, "grad_norm": 0.4804670512676239, "learning_rate": 0.0002, "epoch": 2.984965163182985, "step": 4070}, {"loss": 0.9771, "grad_norm": 1.1918889284133911, "learning_rate": 0.0002, "epoch": 2.9922992299229922, "step": 4080}, {"loss": 0.9178, "grad_norm": 0.5479103326797485, "learning_rate": 0.0002, "epoch": 2.9996332966629997, "step": 4090}, {"eval_loss": 1.1642853021621704, "eval_runtime": 32.7511, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.649, "epoch": 2.9996332966629997, "step": 4090}, {"loss": 0.7981, "grad_norm": 0.7430027723312378, "learning_rate": 0.0002, "epoch": 3.006967363403007, "step": 4100}, {"loss": 0.7871, "grad_norm": 0.6293647289276123, "learning_rate": 0.0002, "epoch": 3.014301430143014, "step": 4110}, {"loss": 0.78, "grad_norm": 0.6191329956054688, "learning_rate": 0.0002, "epoch": 3.0216354968830217, "step": 4120}, {"loss": 0.7618, "grad_norm": 0.7959313988685608, "learning_rate": 0.0002, "epoch": 3.028969563623029, "step": 4130}, {"loss": 0.8039, "grad_norm": 0.5956351161003113, "learning_rate": 0.0002, "epoch": 3.036303630363036, "step": 4140}, {"loss": 0.7477, "grad_norm": 0.670383632183075, "learning_rate": 0.0002, "epoch": 3.0436376971030437, "step": 4150}, {"loss": 0.7984, "grad_norm": 0.6414518356323242, "learning_rate": 0.0002, "epoch": 3.050971763843051, "step": 4160}, {"loss": 0.7369, "grad_norm": 0.7928852438926697, "learning_rate": 0.0002, "epoch": 3.058305830583058, "step": 4170}, {"loss": 0.7914, "grad_norm": 0.6211121082305908, "learning_rate": 0.0002, "epoch": 3.0656398973230656, "step": 4180}, {"loss": 0.7365, "grad_norm": 0.6237057447433472, "learning_rate": 0.0002, "epoch": 3.072973964063073, "step": 4190}, {"loss": 0.702, "grad_norm": 0.6522233486175537, "learning_rate": 0.0002, "epoch": 3.08030803080308, "step": 4200}, {"loss": 0.7646, "grad_norm": 0.9396848678588867, "learning_rate": 0.0002, "epoch": 3.0876420975430876, "step": 4210}, {"loss": 0.7559, "grad_norm": 0.8003010749816895, "learning_rate": 0.0002, "epoch": 3.094976164283095, "step": 4220}, {"loss": 0.711, "grad_norm": 0.6733810305595398, "learning_rate": 0.0002, "epoch": 3.102310231023102, "step": 4230}, {"loss": 0.696, "grad_norm": 0.6365828514099121, "learning_rate": 0.0002, "epoch": 3.1096442977631096, "step": 4240}, {"loss": 0.8362, "grad_norm": 1.0805548429489136, "learning_rate": 0.0002, "epoch": 3.116978364503117, "step": 4250}, {"loss": 0.7651, "grad_norm": 0.7262141108512878, "learning_rate": 0.0002, "epoch": 3.1243124312431245, "step": 4260}, {"loss": 0.7304, "grad_norm": 0.5500539541244507, "learning_rate": 0.0002, "epoch": 3.1316464979831315, "step": 4270}, {"loss": 0.7721, "grad_norm": 0.793912947177887, "learning_rate": 0.0002, "epoch": 3.138980564723139, "step": 4280}, {"loss": 0.7708, "grad_norm": 1.2540518045425415, "learning_rate": 0.0002, "epoch": 3.1463146314631465, "step": 4290}, {"loss": 0.782, "grad_norm": 0.7020077705383301, "learning_rate": 0.0002, "epoch": 3.1536486982031535, "step": 4300}, {"loss": 0.7253, "grad_norm": 0.5111123323440552, "learning_rate": 0.0002, "epoch": 3.160982764943161, "step": 4310}, {"loss": 0.8159, "grad_norm": 0.7172090411186218, "learning_rate": 0.0002, "epoch": 3.1683168316831685, "step": 4320}, {"loss": 0.6962, "grad_norm": 0.6343168616294861, "learning_rate": 0.0002, "epoch": 3.1756508984231755, "step": 4330}, {"loss": 0.7938, "grad_norm": 0.9563672542572021, "learning_rate": 0.0002, "epoch": 3.182984965163183, "step": 4340}, {"loss": 0.7385, "grad_norm": 1.0225574970245361, "learning_rate": 0.0002, "epoch": 3.1903190319031904, "step": 4350}, {"loss": 0.8652, "grad_norm": 1.1633386611938477, "learning_rate": 0.0002, "epoch": 3.1976530986431975, "step": 4360}, {"loss": 0.7259, "grad_norm": 0.8915148973464966, "learning_rate": 0.0002, "epoch": 3.204987165383205, "step": 4370}, {"loss": 0.8061, "grad_norm": 0.9156812429428101, "learning_rate": 0.0002, "epoch": 3.2123212321232124, "step": 4380}, {"loss": 0.8189, "grad_norm": 0.6363258957862854, "learning_rate": 0.0002, "epoch": 3.21965529886322, "step": 4390}, {"loss": 0.7996, "grad_norm": 0.579099178314209, "learning_rate": 0.0002, "epoch": 3.226989365603227, "step": 4400}, {"loss": 0.8592, "grad_norm": 0.8778146505355835, "learning_rate": 0.0002, "epoch": 3.2343234323432344, "step": 4410}, {"loss": 0.8281, "grad_norm": 0.8356770873069763, "learning_rate": 0.0002, "epoch": 3.241657499083242, "step": 4420}, {"loss": 0.8484, "grad_norm": 0.702032208442688, "learning_rate": 0.0002, "epoch": 3.248991565823249, "step": 4430}, {"loss": 0.7227, "grad_norm": 0.6386539340019226, "learning_rate": 0.0002, "epoch": 3.2563256325632564, "step": 4440}, {"loss": 0.8374, "grad_norm": 0.7008408904075623, "learning_rate": 0.0002, "epoch": 3.263659699303264, "step": 4450}, {"loss": 0.7572, "grad_norm": 0.9556332230567932, "learning_rate": 0.0002, "epoch": 3.270993766043271, "step": 4460}, {"loss": 0.743, "grad_norm": 0.5667835474014282, "learning_rate": 0.0002, "epoch": 3.2783278327832783, "step": 4470}, {"loss": 0.8152, "grad_norm": 0.8239172697067261, "learning_rate": 0.0002, "epoch": 3.285661899523286, "step": 4480}, {"loss": 0.756, "grad_norm": 0.7045050859451294, "learning_rate": 0.0002, "epoch": 3.292995966263293, "step": 4490}, {"loss": 0.7655, "grad_norm": 0.7131434082984924, "learning_rate": 0.0002, "epoch": 3.3003300330033003, "step": 4500}, {"loss": 0.836, "grad_norm": 0.6924910545349121, "learning_rate": 0.0002, "epoch": 3.3076640997433078, "step": 4510}, {"loss": 0.736, "grad_norm": 0.8945356607437134, "learning_rate": 0.0002, "epoch": 3.3149981664833152, "step": 4520}, {"loss": 0.7575, "grad_norm": 0.6546903252601624, "learning_rate": 0.0002, "epoch": 3.3223322332233223, "step": 4530}, {"loss": 0.7893, "grad_norm": 0.8206679224967957, "learning_rate": 0.0002, "epoch": 3.3296662999633297, "step": 4540}, {"loss": 0.7502, "grad_norm": 0.6482203602790833, "learning_rate": 0.0002, "epoch": 3.3370003667033368, "step": 4550}, {"loss": 0.8172, "grad_norm": 0.7558760046958923, "learning_rate": 0.0002, "epoch": 3.3443344334433442, "step": 4560}, {"loss": 0.744, "grad_norm": 0.7794756889343262, "learning_rate": 0.0002, "epoch": 3.3516685001833517, "step": 4570}, {"loss": 0.7385, "grad_norm": 0.7382805943489075, "learning_rate": 0.0002, "epoch": 3.359002566923359, "step": 4580}, {"loss": 0.8511, "grad_norm": 0.5912511944770813, "learning_rate": 0.0002, "epoch": 3.366336633663366, "step": 4590}, {"loss": 0.8272, "grad_norm": 0.7444885969161987, "learning_rate": 0.0002, "epoch": 3.3736707004033737, "step": 4600}, {"loss": 0.7927, "grad_norm": 0.7354922890663147, "learning_rate": 0.0002, "epoch": 3.381004767143381, "step": 4610}, {"loss": 0.7183, "grad_norm": 0.7685934901237488, "learning_rate": 0.0002, "epoch": 3.388338833883388, "step": 4620}, {"loss": 0.7436, "grad_norm": 0.61041259765625, "learning_rate": 0.0002, "epoch": 3.3956729006233957, "step": 4630}, {"loss": 0.7661, "grad_norm": 0.6820451021194458, "learning_rate": 0.0002, "epoch": 3.403006967363403, "step": 4640}, {"loss": 0.8796, "grad_norm": 0.5819534063339233, "learning_rate": 0.0002, "epoch": 3.41034103410341, "step": 4650}, {"loss": 0.7314, "grad_norm": 0.705410897731781, "learning_rate": 0.0002, "epoch": 3.4176751008434176, "step": 4660}, {"loss": 0.7901, "grad_norm": 0.8052892088890076, "learning_rate": 0.0002, "epoch": 3.425009167583425, "step": 4670}, {"loss": 0.7298, "grad_norm": 0.7746483087539673, "learning_rate": 0.0002, "epoch": 3.432343234323432, "step": 4680}, {"loss": 0.7976, "grad_norm": 0.7713689804077148, "learning_rate": 0.0002, "epoch": 3.4396773010634396, "step": 4690}, {"loss": 0.7427, "grad_norm": 0.810371994972229, "learning_rate": 0.0002, "epoch": 3.447011367803447, "step": 4700}, {"loss": 0.7594, "grad_norm": 0.7702969312667847, "learning_rate": 0.0002, "epoch": 3.4543454345434546, "step": 4710}, {"loss": 0.7957, "grad_norm": 0.7069268822669983, "learning_rate": 0.0002, "epoch": 3.4616795012834616, "step": 4720}, {"loss": 0.8199, "grad_norm": 0.7640359401702881, "learning_rate": 0.0002, "epoch": 3.469013568023469, "step": 4730}, {"loss": 0.6875, "grad_norm": 0.8661707639694214, "learning_rate": 0.0002, "epoch": 3.4763476347634765, "step": 4740}, {"loss": 0.8528, "grad_norm": 0.9970282912254333, "learning_rate": 0.0002, "epoch": 3.4836817015034836, "step": 4750}, {"loss": 0.8462, "grad_norm": 0.5824355483055115, "learning_rate": 0.0002, "epoch": 3.491015768243491, "step": 4760}, {"loss": 0.851, "grad_norm": 1.3072649240493774, "learning_rate": 0.0002, "epoch": 3.4983498349834985, "step": 4770}, {"loss": 0.9101, "grad_norm": 0.873978316783905, "learning_rate": 0.0002, "epoch": 3.5056839017235055, "step": 4780}, {"loss": 0.7403, "grad_norm": 0.5526657104492188, "learning_rate": 0.0002, "epoch": 3.513017968463513, "step": 4790}, {"loss": 0.7921, "grad_norm": 0.790894627571106, "learning_rate": 0.0002, "epoch": 3.5203520352035205, "step": 4800}, {"loss": 0.831, "grad_norm": 0.8119630217552185, "learning_rate": 0.0002, "epoch": 3.5276861019435275, "step": 4810}, {"loss": 0.7351, "grad_norm": 0.633212149143219, "learning_rate": 0.0002, "epoch": 3.535020168683535, "step": 4820}, {"loss": 0.8505, "grad_norm": 0.703029990196228, "learning_rate": 0.0002, "epoch": 3.5423542354235424, "step": 4830}, {"loss": 0.7204, "grad_norm": 0.7603771686553955, "learning_rate": 0.0002, "epoch": 3.54968830216355, "step": 4840}, {"loss": 0.8868, "grad_norm": 0.6260480880737305, "learning_rate": 0.0002, "epoch": 3.557022368903557, "step": 4850}, {"loss": 0.8137, "grad_norm": 0.8203664422035217, "learning_rate": 0.0002, "epoch": 3.5643564356435644, "step": 4860}, {"loss": 0.8821, "grad_norm": 0.7793813347816467, "learning_rate": 0.0002, "epoch": 3.5716905023835714, "step": 4870}, {"loss": 0.8164, "grad_norm": 0.7667397260665894, "learning_rate": 0.0002, "epoch": 3.579024569123579, "step": 4880}, {"loss": 0.7597, "grad_norm": 0.8198829889297485, "learning_rate": 0.0002, "epoch": 3.5863586358635864, "step": 4890}, {"loss": 0.7027, "grad_norm": 0.7689233422279358, "learning_rate": 0.0002, "epoch": 3.593692702603594, "step": 4900}, {"loss": 0.804, "grad_norm": 0.7870983481407166, "learning_rate": 0.0002, "epoch": 3.601026769343601, "step": 4910}, {"loss": 0.8269, "grad_norm": 0.8133853077888489, "learning_rate": 0.0002, "epoch": 3.6083608360836084, "step": 4920}, {"loss": 0.8515, "grad_norm": 1.308401346206665, "learning_rate": 0.0002, "epoch": 3.615694902823616, "step": 4930}, {"loss": 0.8494, "grad_norm": 0.7131121754646301, "learning_rate": 0.0002, "epoch": 3.623028969563623, "step": 4940}, {"loss": 0.7235, "grad_norm": 0.6825910210609436, "learning_rate": 0.0002, "epoch": 3.6303630363036303, "step": 4950}, {"loss": 0.7824, "grad_norm": 0.7254678606987, "learning_rate": 0.0002, "epoch": 3.637697103043638, "step": 4960}, {"loss": 0.7983, "grad_norm": 0.8045085072517395, "learning_rate": 0.0002, "epoch": 3.6450311697836453, "step": 4970}, {"loss": 0.8223, "grad_norm": 0.6991777420043945, "learning_rate": 0.0002, "epoch": 3.6523652365236523, "step": 4980}, {"loss": 0.7806, "grad_norm": 0.7804713249206543, "learning_rate": 0.0002, "epoch": 3.6596993032636598, "step": 4990}, {"loss": 0.8402, "grad_norm": 0.8525708317756653, "learning_rate": 0.0002, "epoch": 3.667033370003667, "step": 5000}, {"loss": 0.8496, "grad_norm": 0.7959994673728943, "learning_rate": 0.0002, "epoch": 3.6743674367436743, "step": 5010}, {"loss": 0.8022, "grad_norm": 0.8103628158569336, "learning_rate": 0.0002, "epoch": 3.6817015034836817, "step": 5020}, {"loss": 0.7376, "grad_norm": 0.7517836093902588, "learning_rate": 0.0002, "epoch": 3.689035570223689, "step": 5030}, {"loss": 0.8375, "grad_norm": 0.6878514289855957, "learning_rate": 0.0002, "epoch": 3.6963696369636962, "step": 5040}, {"loss": 0.7998, "grad_norm": 1.2371820211410522, "learning_rate": 0.0002, "epoch": 3.7037037037037037, "step": 5050}, {"loss": 0.6941, "grad_norm": 0.6567103862762451, "learning_rate": 0.0002, "epoch": 3.711037770443711, "step": 5060}, {"loss": 0.8465, "grad_norm": 1.1254922151565552, "learning_rate": 0.0002, "epoch": 3.718371837183718, "step": 5070}, {"loss": 0.8365, "grad_norm": 0.6796132326126099, "learning_rate": 0.0002, "epoch": 3.7257059039237257, "step": 5080}, {"loss": 0.7818, "grad_norm": 0.7285300493240356, "learning_rate": 0.0002, "epoch": 3.733039970663733, "step": 5090}, {"loss": 0.8581, "grad_norm": 0.8931500911712646, "learning_rate": 0.0002, "epoch": 3.7403740374037406, "step": 5100}, {"loss": 0.8181, "grad_norm": 0.6256856918334961, "learning_rate": 0.0002, "epoch": 3.7477081041437477, "step": 5110}, {"loss": 0.743, "grad_norm": 0.79310142993927, "learning_rate": 0.0002, "epoch": 3.755042170883755, "step": 5120}, {"loss": 0.8235, "grad_norm": 0.6594041585922241, "learning_rate": 0.0002, "epoch": 3.762376237623762, "step": 5130}, {"loss": 0.6925, "grad_norm": 0.7029327750205994, "learning_rate": 0.0002, "epoch": 3.7697103043637696, "step": 5140}, {"loss": 0.7457, "grad_norm": 0.5880070328712463, "learning_rate": 0.0002, "epoch": 3.777044371103777, "step": 5150}, {"loss": 0.8716, "grad_norm": 0.7578945159912109, "learning_rate": 0.0002, "epoch": 3.7843784378437846, "step": 5160}, {"loss": 0.8819, "grad_norm": 0.8276378512382507, "learning_rate": 0.0002, "epoch": 3.7917125045837916, "step": 5170}, {"loss": 0.7559, "grad_norm": 0.7627953886985779, "learning_rate": 0.0002, "epoch": 3.799046571323799, "step": 5180}, {"loss": 0.7665, "grad_norm": 0.8169086575508118, "learning_rate": 0.0002, "epoch": 3.806380638063806, "step": 5190}, {"loss": 0.761, "grad_norm": 0.6605030298233032, "learning_rate": 0.0002, "epoch": 3.8137147048038136, "step": 5200}, {"loss": 0.8804, "grad_norm": 0.5837286114692688, "learning_rate": 0.0002, "epoch": 3.821048771543821, "step": 5210}, {"loss": 0.8369, "grad_norm": 1.2422157526016235, "learning_rate": 0.0002, "epoch": 3.8283828382838285, "step": 5220}, {"loss": 0.8431, "grad_norm": 0.6589220762252808, "learning_rate": 0.0002, "epoch": 3.8357169050238356, "step": 5230}, {"loss": 0.7686, "grad_norm": 0.8567556142807007, "learning_rate": 0.0002, "epoch": 3.843050971763843, "step": 5240}, {"loss": 0.8652, "grad_norm": 0.6490627527236938, "learning_rate": 0.0002, "epoch": 3.8503850385038505, "step": 5250}, {"loss": 0.7386, "grad_norm": 0.620232880115509, "learning_rate": 0.0002, "epoch": 3.8577191052438575, "step": 5260}, {"loss": 0.9192, "grad_norm": 0.7685128450393677, "learning_rate": 0.0002, "epoch": 3.865053171983865, "step": 5270}, {"loss": 0.872, "grad_norm": 0.8113296627998352, "learning_rate": 0.0002, "epoch": 3.8723872387238725, "step": 5280}, {"loss": 0.7156, "grad_norm": 0.8092675805091858, "learning_rate": 0.0002, "epoch": 3.87972130546388, "step": 5290}, {"loss": 0.7325, "grad_norm": 0.583570122718811, "learning_rate": 0.0002, "epoch": 3.887055372203887, "step": 5300}, {"loss": 0.9333, "grad_norm": 1.712363600730896, "learning_rate": 0.0002, "epoch": 3.8943894389438944, "step": 5310}, {"loss": 0.7537, "grad_norm": 0.6673534512519836, "learning_rate": 0.0002, "epoch": 3.9017235056839015, "step": 5320}, {"loss": 0.7035, "grad_norm": 1.9770312309265137, "learning_rate": 0.0002, "epoch": 3.909057572423909, "step": 5330}, {"loss": 0.8793, "grad_norm": 0.6430999636650085, "learning_rate": 0.0002, "epoch": 3.9163916391639164, "step": 5340}, {"loss": 0.839, "grad_norm": 1.0159571170806885, "learning_rate": 0.0002, "epoch": 3.923725705903924, "step": 5350}, {"loss": 0.9332, "grad_norm": 0.8607584834098816, "learning_rate": 0.0002, "epoch": 3.931059772643931, "step": 5360}, {"loss": 0.7261, "grad_norm": 0.6967900991439819, "learning_rate": 0.0002, "epoch": 3.9383938393839384, "step": 5370}, {"loss": 0.8456, "grad_norm": 0.7683077454566956, "learning_rate": 0.0002, "epoch": 3.945727906123946, "step": 5380}, {"loss": 0.7682, "grad_norm": 0.6805762648582458, "learning_rate": 0.0002, "epoch": 3.953061972863953, "step": 5390}, {"loss": 0.7746, "grad_norm": 0.7033619284629822, "learning_rate": 0.0002, "epoch": 3.9603960396039604, "step": 5400}, {"loss": 0.8393, "grad_norm": 0.966112494468689, "learning_rate": 0.0002, "epoch": 3.967730106343968, "step": 5410}, {"loss": 0.8316, "grad_norm": 0.8467881083488464, "learning_rate": 0.0002, "epoch": 3.9750641730839753, "step": 5420}, {"loss": 0.8084, "grad_norm": 0.8005317449569702, "learning_rate": 0.0002, "epoch": 3.9823982398239823, "step": 5430}, {"loss": 0.7168, "grad_norm": 1.1615241765975952, "learning_rate": 0.0002, "epoch": 3.98973230656399, "step": 5440}, {"loss": 0.8263, "grad_norm": 0.6121614575386047, "learning_rate": 0.0002, "epoch": 3.997066373303997, "step": 5450}, {"eval_loss": 1.1834222078323364, "eval_runtime": 32.7569, "eval_samples_per_second": 13.158, "eval_steps_per_second": 1.649, "epoch": 4.0, "step": 5454}, {"loss": 0.7267, "grad_norm": 0.6055727005004883, "learning_rate": 0.0002, "epoch": 4.004400440044004, "step": 5460}, {"loss": 0.5766, "grad_norm": 0.8232647180557251, "learning_rate": 0.0002, "epoch": 4.011734506784012, "step": 5470}, {"loss": 0.6489, "grad_norm": 0.7739192247390747, "learning_rate": 0.0002, "epoch": 4.019068573524019, "step": 5480}, {"loss": 0.5978, "grad_norm": 0.6264950633049011, "learning_rate": 0.0002, "epoch": 4.026402640264027, "step": 5490}, {"loss": 0.6392, "grad_norm": 1.4798702001571655, "learning_rate": 0.0002, "epoch": 4.033736707004033, "step": 5500}, {"loss": 0.6143, "grad_norm": 0.9538470506668091, "learning_rate": 0.0002, "epoch": 4.041070773744041, "step": 5510}, {"loss": 0.6056, "grad_norm": 0.834561288356781, "learning_rate": 0.0002, "epoch": 4.048404840484048, "step": 5520}, {"loss": 0.6077, "grad_norm": 0.6407850384712219, "learning_rate": 0.0002, "epoch": 4.055738907224056, "step": 5530}, {"loss": 0.6733, "grad_norm": 0.9035961627960205, "learning_rate": 0.0002, "epoch": 4.063072973964063, "step": 5540}, {"loss": 0.5854, "grad_norm": 0.842812716960907, "learning_rate": 0.0002, "epoch": 4.070407040704071, "step": 5550}, {"loss": 0.654, "grad_norm": 0.8197882175445557, "learning_rate": 0.0002, "epoch": 4.077741107444078, "step": 5560}, {"loss": 0.5919, "grad_norm": 0.8652673959732056, "learning_rate": 0.0002, "epoch": 4.085075174184085, "step": 5570}, {"loss": 0.6188, "grad_norm": 0.8048318028450012, "learning_rate": 0.0002, "epoch": 4.092409240924092, "step": 5580}, {"loss": 0.6487, "grad_norm": 0.9604969024658203, "learning_rate": 0.0002, "epoch": 4.0997433076641, "step": 5590}, {"loss": 0.6356, "grad_norm": 1.244756817817688, "learning_rate": 0.0002, "epoch": 4.107077374404107, "step": 5600}, {"loss": 0.6489, "grad_norm": 0.7975269556045532, "learning_rate": 0.0002, "epoch": 4.114411441144115, "step": 5610}, {"loss": 0.6445, "grad_norm": 0.6130099296569824, "learning_rate": 0.0002, "epoch": 4.121745507884122, "step": 5620}, {"loss": 0.6024, "grad_norm": 0.7793202996253967, "learning_rate": 0.0002, "epoch": 4.129079574624129, "step": 5630}, {"loss": 0.5723, "grad_norm": 1.187238335609436, "learning_rate": 0.0002, "epoch": 4.136413641364136, "step": 5640}, {"loss": 0.6385, "grad_norm": 0.8450375199317932, "learning_rate": 0.0002, "epoch": 4.143747708104144, "step": 5650}, {"loss": 0.6866, "grad_norm": 0.9006940126419067, "learning_rate": 0.0002, "epoch": 4.151081774844151, "step": 5660}, {"loss": 0.6179, "grad_norm": 0.9447154998779297, "learning_rate": 0.0002, "epoch": 4.158415841584159, "step": 5670}, {"loss": 0.6476, "grad_norm": 0.798032283782959, "learning_rate": 0.0002, "epoch": 4.165749908324166, "step": 5680}, {"loss": 0.6666, "grad_norm": 0.65578693151474, "learning_rate": 0.0002, "epoch": 4.1730839750641735, "step": 5690}, {"loss": 0.701, "grad_norm": 1.0864700078964233, "learning_rate": 0.0002, "epoch": 4.18041804180418, "step": 5700}, {"loss": 0.6895, "grad_norm": 0.7344121932983398, "learning_rate": 0.0002, "epoch": 4.187752108544188, "step": 5710}, {"loss": 0.6659, "grad_norm": 0.9722456932067871, "learning_rate": 0.0002, "epoch": 4.195086175284195, "step": 5720}, {"loss": 0.6887, "grad_norm": 1.263814926147461, "learning_rate": 0.0002, "epoch": 4.2024202420242025, "step": 5730}, {"loss": 0.608, "grad_norm": 0.9622581005096436, "learning_rate": 0.0002, "epoch": 4.20975430876421, "step": 5740}, {"loss": 0.6221, "grad_norm": 0.8497143387794495, "learning_rate": 0.0002, "epoch": 4.2170883755042174, "step": 5750}, {"loss": 0.6322, "grad_norm": 0.8248446583747864, "learning_rate": 0.0002, "epoch": 4.224422442244224, "step": 5760}, {"loss": 0.6045, "grad_norm": 1.2544798851013184, "learning_rate": 0.0002, "epoch": 4.2317565089842315, "step": 5770}, {"loss": 0.641, "grad_norm": 0.8224676251411438, "learning_rate": 0.0002, "epoch": 4.239090575724239, "step": 5780}, {"loss": 0.6399, "grad_norm": 0.8924877047538757, "learning_rate": 0.0002, "epoch": 4.2464246424642464, "step": 5790}, {"loss": 0.6845, "grad_norm": 0.8545848727226257, "learning_rate": 0.0002, "epoch": 4.253758709204254, "step": 5800}, {"loss": 0.6669, "grad_norm": 0.8081067800521851, "learning_rate": 0.0002, "epoch": 4.261092775944261, "step": 5810}, {"loss": 0.6149, "grad_norm": 0.7111002802848816, "learning_rate": 0.0002, "epoch": 4.268426842684269, "step": 5820}, {"loss": 0.6343, "grad_norm": 0.8696979880332947, "learning_rate": 0.0002, "epoch": 4.2757609094242754, "step": 5830}, {"loss": 0.6384, "grad_norm": 0.821401834487915, "learning_rate": 0.0002, "epoch": 4.283094976164283, "step": 5840}, {"loss": 0.6912, "grad_norm": 0.888908326625824, "learning_rate": 0.0002, "epoch": 4.29042904290429, "step": 5850}, {"loss": 0.6061, "grad_norm": 1.9380123615264893, "learning_rate": 0.0002, "epoch": 4.297763109644298, "step": 5860}, {"loss": 0.6766, "grad_norm": 1.121774435043335, "learning_rate": 0.0002, "epoch": 4.305097176384305, "step": 5870}, {"loss": 0.7205, "grad_norm": 0.9238282442092896, "learning_rate": 0.0002, "epoch": 4.312431243124313, "step": 5880}, {"loss": 0.6351, "grad_norm": 0.7321620583534241, "learning_rate": 0.0002, "epoch": 4.319765309864319, "step": 5890}, {"loss": 0.6404, "grad_norm": 0.8739548325538635, "learning_rate": 0.0002, "epoch": 4.327099376604327, "step": 5900}, {"loss": 0.5892, "grad_norm": 0.9686012268066406, "learning_rate": 0.0002, "epoch": 4.334433443344334, "step": 5910}, {"loss": 0.641, "grad_norm": 0.9033839106559753, "learning_rate": 0.0002, "epoch": 4.341767510084342, "step": 5920}, {"loss": 0.6456, "grad_norm": 0.8131115436553955, "learning_rate": 0.0002, "epoch": 4.349101576824349, "step": 5930}, {"loss": 0.5826, "grad_norm": 0.8942412734031677, "learning_rate": 0.0002, "epoch": 4.356435643564357, "step": 5940}, {"loss": 0.7336, "grad_norm": 0.8439112901687622, "learning_rate": 0.0002, "epoch": 4.363769710304364, "step": 5950}, {"loss": 0.6537, "grad_norm": 0.9176713228225708, "learning_rate": 0.0002, "epoch": 4.371103777044371, "step": 5960}, {"loss": 0.6792, "grad_norm": 0.6799634695053101, "learning_rate": 0.0002, "epoch": 4.378437843784378, "step": 5970}, {"loss": 0.7266, "grad_norm": 1.0435824394226074, "learning_rate": 0.0002, "epoch": 4.385771910524386, "step": 5980}, {"loss": 0.68, "grad_norm": 0.997937798500061, "learning_rate": 0.0002, "epoch": 4.393105977264393, "step": 5990}, {"loss": 0.6604, "grad_norm": 1.0308842658996582, "learning_rate": 0.0002, "epoch": 4.400440044004401, "step": 6000}, {"loss": 0.6402, "grad_norm": 1.3683775663375854, "learning_rate": 0.0002, "epoch": 4.407774110744408, "step": 6010}, {"loss": 0.7027, "grad_norm": 0.7569534182548523, "learning_rate": 0.0002, "epoch": 4.415108177484415, "step": 6020}, {"loss": 0.5949, "grad_norm": 1.089978575706482, "learning_rate": 0.0002, "epoch": 4.422442244224422, "step": 6030}, {"loss": 0.6353, "grad_norm": 0.7522459626197815, "learning_rate": 0.0002, "epoch": 4.42977631096443, "step": 6040}, {"loss": 0.5852, "grad_norm": 0.6709823608398438, "learning_rate": 0.0002, "epoch": 4.437110377704437, "step": 6050}, {"loss": 0.6718, "grad_norm": 0.6992089748382568, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6060}, {"loss": 0.6933, "grad_norm": 1.0182931423187256, "learning_rate": 0.0002, "epoch": 4.451778511184452, "step": 6070}, {"loss": 0.6255, "grad_norm": 1.0685160160064697, "learning_rate": 0.0002, "epoch": 4.459112577924459, "step": 6080}, {"loss": 0.6086, "grad_norm": 0.8295124769210815, "learning_rate": 0.0002, "epoch": 4.466446644664466, "step": 6090}, {"loss": 0.6359, "grad_norm": 1.1862998008728027, "learning_rate": 0.0002, "epoch": 4.473780711404474, "step": 6100}, {"loss": 0.638, "grad_norm": 0.7400273084640503, "learning_rate": 0.0002, "epoch": 4.481114778144481, "step": 6110}, {"loss": 0.6854, "grad_norm": 0.7098417282104492, "learning_rate": 0.0002, "epoch": 4.488448844884489, "step": 6120}, {"loss": 0.6976, "grad_norm": 0.9745053648948669, "learning_rate": 0.0002, "epoch": 4.495782911624496, "step": 6130}, {"loss": 0.605, "grad_norm": 0.8638797998428345, "learning_rate": 0.0002, "epoch": 4.503116978364503, "step": 6140}, {"loss": 0.6491, "grad_norm": 0.8291046619415283, "learning_rate": 0.0002, "epoch": 4.51045104510451, "step": 6150}, {"loss": 0.6457, "grad_norm": 1.0301737785339355, "learning_rate": 0.0002, "epoch": 4.517785111844518, "step": 6160}, {"loss": 0.6742, "grad_norm": 1.1996512413024902, "learning_rate": 0.0002, "epoch": 4.525119178584525, "step": 6170}, {"loss": 0.6484, "grad_norm": 1.151038408279419, "learning_rate": 0.0002, "epoch": 4.5324532453245325, "step": 6180}, {"loss": 0.668, "grad_norm": 0.8385201096534729, "learning_rate": 0.0002, "epoch": 4.53978731206454, "step": 6190}, {"loss": 0.6381, "grad_norm": 0.8969188332557678, "learning_rate": 0.0002, "epoch": 4.5471213788045475, "step": 6200}, {"loss": 0.7141, "grad_norm": 1.60659658908844, "learning_rate": 0.0002, "epoch": 4.554455445544555, "step": 6210}, {"loss": 0.6388, "grad_norm": 0.9356731176376343, "learning_rate": 0.0002, "epoch": 4.5617895122845615, "step": 6220}, {"loss": 0.7393, "grad_norm": 0.95856773853302, "learning_rate": 0.0002, "epoch": 4.569123579024569, "step": 6230}, {"loss": 0.6554, "grad_norm": 1.1162524223327637, "learning_rate": 0.0002, "epoch": 4.5764576457645765, "step": 6240}, {"loss": 0.6012, "grad_norm": 0.8809238076210022, "learning_rate": 0.0002, "epoch": 4.583791712504584, "step": 6250}, {"loss": 0.648, "grad_norm": 0.890738844871521, "learning_rate": 0.0002, "epoch": 4.591125779244591, "step": 6260}, {"loss": 0.6663, "grad_norm": 0.918684720993042, "learning_rate": 0.0002, "epoch": 4.598459845984598, "step": 6270}, {"loss": 0.5992, "grad_norm": 0.8156296610832214, "learning_rate": 0.0002, "epoch": 4.6057939127246055, "step": 6280}, {"loss": 0.723, "grad_norm": 1.046634316444397, "learning_rate": 0.0002, "epoch": 4.613127979464613, "step": 6290}, {"loss": 0.7023, "grad_norm": 0.7725525498390198, "learning_rate": 0.0002, "epoch": 4.62046204620462, "step": 6300}, {"loss": 0.6414, "grad_norm": 0.9992046356201172, "learning_rate": 0.0002, "epoch": 4.627796112944628, "step": 6310}, {"loss": 0.6201, "grad_norm": 0.8480095267295837, "learning_rate": 0.0002, "epoch": 4.635130179684635, "step": 6320}, {"loss": 0.6869, "grad_norm": 0.7061955332756042, "learning_rate": 0.0002, "epoch": 4.642464246424643, "step": 6330}, {"loss": 0.6828, "grad_norm": 1.0354212522506714, "learning_rate": 0.0002, "epoch": 4.649798313164649, "step": 6340}, {"loss": 0.6651, "grad_norm": 1.0081377029418945, "learning_rate": 0.0002, "epoch": 4.657132379904657, "step": 6350}, {"loss": 0.726, "grad_norm": 1.2904249429702759, "learning_rate": 0.0002, "epoch": 4.664466446644664, "step": 6360}, {"loss": 0.7148, "grad_norm": 0.9248910546302795, "learning_rate": 0.0002, "epoch": 4.671800513384672, "step": 6370}, {"loss": 0.6961, "grad_norm": 0.9907804131507874, "learning_rate": 0.0002, "epoch": 4.679134580124679, "step": 6380}, {"loss": 0.6163, "grad_norm": 1.201143741607666, "learning_rate": 0.0002, "epoch": 4.686468646864687, "step": 6390}, {"loss": 0.6762, "grad_norm": 0.8709394335746765, "learning_rate": 0.0002, "epoch": 4.693802713604693, "step": 6400}, {"loss": 0.7217, "grad_norm": 0.7468608021736145, "learning_rate": 0.0002, "epoch": 4.701136780344701, "step": 6410}, {"loss": 0.6548, "grad_norm": 0.8607903718948364, "learning_rate": 0.0002, "epoch": 4.708470847084708, "step": 6420}, {"loss": 0.6449, "grad_norm": 0.9840512871742249, "learning_rate": 0.0002, "epoch": 4.715804913824716, "step": 6430}, {"loss": 0.685, "grad_norm": 0.8328204154968262, "learning_rate": 0.0002, "epoch": 4.723138980564723, "step": 6440}, {"loss": 0.697, "grad_norm": 0.924505352973938, "learning_rate": 0.0002, "epoch": 4.730473047304731, "step": 6450}, {"loss": 0.7422, "grad_norm": 0.8897685408592224, "learning_rate": 0.0002, "epoch": 4.737807114044738, "step": 6460}, {"loss": 0.6842, "grad_norm": 0.9605024456977844, "learning_rate": 0.0002, "epoch": 4.745141180784745, "step": 6470}, {"loss": 0.6488, "grad_norm": 0.8150759935379028, "learning_rate": 0.0002, "epoch": 4.752475247524752, "step": 6480}, {"loss": 0.6606, "grad_norm": 0.8128412961959839, "learning_rate": 0.0002, "epoch": 4.75980931426476, "step": 6490}, {"loss": 0.6729, "grad_norm": 0.7381404638290405, "learning_rate": 0.0002, "epoch": 4.767143381004767, "step": 6500}, {"loss": 0.6713, "grad_norm": 1.0565853118896484, "learning_rate": 0.0002, "epoch": 4.774477447744775, "step": 6510}, {"loss": 0.6496, "grad_norm": 0.9298134446144104, "learning_rate": 0.0002, "epoch": 4.781811514484782, "step": 6520}, {"loss": 0.7279, "grad_norm": 1.0145525932312012, "learning_rate": 0.0002, "epoch": 4.789145581224789, "step": 6530}, {"loss": 0.5986, "grad_norm": 0.92259681224823, "learning_rate": 0.0002, "epoch": 4.796479647964796, "step": 6540}, {"loss": 0.63, "grad_norm": 0.7881024479866028, "learning_rate": 0.0002, "epoch": 4.803813714704804, "step": 6550}, {"loss": 0.7134, "grad_norm": 1.4935206174850464, "learning_rate": 0.0002, "epoch": 4.811147781444811, "step": 6560}, {"loss": 0.6695, "grad_norm": 0.8612369298934937, "learning_rate": 0.0002, "epoch": 4.818481848184819, "step": 6570}, {"loss": 0.779, "grad_norm": 1.0118653774261475, "learning_rate": 0.0002, "epoch": 4.825815914924826, "step": 6580}, {"loss": 0.6991, "grad_norm": 1.1303809881210327, "learning_rate": 0.0002, "epoch": 4.833149981664834, "step": 6590}, {"loss": 0.7887, "grad_norm": 0.9112492203712463, "learning_rate": 0.0002, "epoch": 4.84048404840484, "step": 6600}, {"loss": 0.7699, "grad_norm": 0.864762544631958, "learning_rate": 0.0002, "epoch": 4.847818115144848, "step": 6610}, {"loss": 0.7347, "grad_norm": 0.9090572595596313, "learning_rate": 0.0002, "epoch": 4.855152181884855, "step": 6620}, {"loss": 0.6608, "grad_norm": 1.014953374862671, "learning_rate": 0.0002, "epoch": 4.862486248624863, "step": 6630}, {"loss": 0.6429, "grad_norm": 1.0702149868011475, "learning_rate": 0.0002, "epoch": 4.86982031536487, "step": 6640}, {"loss": 0.6943, "grad_norm": 1.002135157585144, "learning_rate": 0.0002, "epoch": 4.8771543821048775, "step": 6650}, {"loss": 0.7225, "grad_norm": 0.862545907497406, "learning_rate": 0.0002, "epoch": 4.884488448844884, "step": 6660}, {"loss": 0.6206, "grad_norm": 0.7302131056785583, "learning_rate": 0.0002, "epoch": 4.891822515584892, "step": 6670}, {"loss": 0.7175, "grad_norm": 0.8380730152130127, "learning_rate": 0.0002, "epoch": 4.899156582324899, "step": 6680}, {"loss": 0.645, "grad_norm": 0.7956018447875977, "learning_rate": 0.0002, "epoch": 4.9064906490649065, "step": 6690}, {"loss": 0.6431, "grad_norm": 0.6717583537101746, "learning_rate": 0.0002, "epoch": 4.913824715804914, "step": 6700}, {"loss": 0.6942, "grad_norm": 1.09099280834198, "learning_rate": 0.0002, "epoch": 4.9211587825449215, "step": 6710}, {"loss": 0.7533, "grad_norm": 0.8589889407157898, "learning_rate": 0.0002, "epoch": 4.928492849284929, "step": 6720}, {"loss": 0.66, "grad_norm": 1.0046314001083374, "learning_rate": 0.0002, "epoch": 4.9358269160249355, "step": 6730}, {"loss": 0.6864, "grad_norm": 0.8559659123420715, "learning_rate": 0.0002, "epoch": 4.943160982764943, "step": 6740}, {"loss": 0.6847, "grad_norm": 0.8588525652885437, "learning_rate": 0.0002, "epoch": 4.9504950495049505, "step": 6750}, {"loss": 0.6428, "grad_norm": 0.9192708134651184, "learning_rate": 0.0002, "epoch": 4.957829116244958, "step": 6760}, {"loss": 0.6873, "grad_norm": 1.051398754119873, "learning_rate": 0.0002, "epoch": 4.965163182984965, "step": 6770}, {"loss": 0.7249, "grad_norm": 0.9111362099647522, "learning_rate": 0.0002, "epoch": 4.972497249724973, "step": 6780}, {"loss": 0.7613, "grad_norm": 0.7305638194084167, "learning_rate": 0.0002, "epoch": 4.9798313164649795, "step": 6790}, {"loss": 0.6747, "grad_norm": 1.118837594985962, "learning_rate": 0.0002, "epoch": 4.987165383204987, "step": 6800}, {"loss": 0.6412, "grad_norm": 0.9075239300727844, "learning_rate": 0.0002, "epoch": 4.994499449944994, "step": 6810}, {"eval_loss": 1.2361247539520264, "eval_runtime": 32.7325, "eval_samples_per_second": 13.167, "eval_steps_per_second": 1.65, "epoch": 4.999633296662999, "step": 6817}, {"loss": 0.7091, "grad_norm": 1.0541315078735352, "learning_rate": 0.0002, "epoch": 5.001833516685002, "step": 6820}, {"loss": 0.4882, "grad_norm": 0.9750140905380249, "learning_rate": 0.0002, "epoch": 5.009167583425009, "step": 6830}, {"loss": 0.6022, "grad_norm": 0.931838870048523, "learning_rate": 0.0002, "epoch": 5.016501650165017, "step": 6840}, {"loss": 0.5194, "grad_norm": 1.110278844833374, "learning_rate": 0.0002, "epoch": 5.023835716905023, "step": 6850}, {"loss": 0.4676, "grad_norm": 1.0670180320739746, "learning_rate": 0.0002, "epoch": 5.031169783645031, "step": 6860}, {"loss": 0.4374, "grad_norm": 0.8762092590332031, "learning_rate": 0.0002, "epoch": 5.038503850385038, "step": 6870}, {"loss": 0.505, "grad_norm": 1.1169432401657104, "learning_rate": 0.0002, "epoch": 5.045837917125046, "step": 6880}, {"loss": 0.5114, "grad_norm": 1.005491018295288, "learning_rate": 0.0002, "epoch": 5.053171983865053, "step": 6890}, {"loss": 0.5221, "grad_norm": 1.1751841306686401, "learning_rate": 0.0002, "epoch": 5.060506050605061, "step": 6900}, {"loss": 0.451, "grad_norm": 0.8501367568969727, "learning_rate": 0.0002, "epoch": 5.067840117345068, "step": 6910}, {"loss": 0.5292, "grad_norm": 0.9795131683349609, "learning_rate": 0.0002, "epoch": 5.075174184085075, "step": 6920}, {"loss": 0.5234, "grad_norm": 0.8929879665374756, "learning_rate": 0.0002, "epoch": 5.082508250825082, "step": 6930}, {"loss": 0.5378, "grad_norm": 1.0156651735305786, "learning_rate": 0.0002, "epoch": 5.08984231756509, "step": 6940}, {"loss": 0.5241, "grad_norm": 1.0974335670471191, "learning_rate": 0.0002, "epoch": 5.097176384305097, "step": 6950}, {"loss": 0.5705, "grad_norm": 1.7015666961669922, "learning_rate": 0.0002, "epoch": 5.104510451045105, "step": 6960}, {"loss": 0.523, "grad_norm": 1.0343226194381714, "learning_rate": 0.0002, "epoch": 5.111844517785112, "step": 6970}, {"loss": 0.4616, "grad_norm": 1.3072983026504517, "learning_rate": 0.0002, "epoch": 5.119178584525119, "step": 6980}, {"loss": 0.4813, "grad_norm": 1.038986086845398, "learning_rate": 0.0002, "epoch": 5.126512651265126, "step": 6990}, {"loss": 0.4616, "grad_norm": 0.8638386130332947, "learning_rate": 0.0002, "epoch": 5.133846718005134, "step": 7000}, {"loss": 0.5294, "grad_norm": 0.8326523900032043, "learning_rate": 0.0002, "epoch": 5.141180784745141, "step": 7010}, {"loss": 0.5021, "grad_norm": 1.0976895093917847, "learning_rate": 0.0002, "epoch": 5.148514851485149, "step": 7020}, {"loss": 0.4677, "grad_norm": 1.0077873468399048, "learning_rate": 0.0002, "epoch": 5.155848918225156, "step": 7030}, {"loss": 0.5262, "grad_norm": 1.0662257671356201, "learning_rate": 0.0002, "epoch": 5.163182984965164, "step": 7040}, {"loss": 0.5484, "grad_norm": 1.206271767616272, "learning_rate": 0.0002, "epoch": 5.17051705170517, "step": 7050}, {"loss": 0.4817, "grad_norm": 1.1990262269973755, "learning_rate": 0.0002, "epoch": 5.177851118445178, "step": 7060}, {"loss": 0.6048, "grad_norm": 1.0207163095474243, "learning_rate": 0.0002, "epoch": 5.185185185185185, "step": 7070}, {"loss": 0.4816, "grad_norm": 1.2783987522125244, "learning_rate": 0.0002, "epoch": 5.192519251925193, "step": 7080}, {"loss": 0.5322, "grad_norm": 1.1592512130737305, "learning_rate": 0.0002, "epoch": 5.1998533186652, "step": 7090}, {"loss": 0.5472, "grad_norm": 1.1053160429000854, "learning_rate": 0.0002, "epoch": 5.2071873854052075, "step": 7100}, {"loss": 0.4986, "grad_norm": 1.1925510168075562, "learning_rate": 0.0002, "epoch": 5.214521452145214, "step": 7110}, {"loss": 0.5065, "grad_norm": 1.0714877843856812, "learning_rate": 0.0002, "epoch": 5.221855518885222, "step": 7120}, {"loss": 0.5209, "grad_norm": 0.9451011419296265, "learning_rate": 0.0002, "epoch": 5.229189585625229, "step": 7130}, {"loss": 0.5298, "grad_norm": 1.03838050365448, "learning_rate": 0.0002, "epoch": 5.2365236523652365, "step": 7140}, {"loss": 0.4848, "grad_norm": 0.9204146265983582, "learning_rate": 0.0002, "epoch": 5.243857719105244, "step": 7150}, {"loss": 0.5164, "grad_norm": 1.0142229795455933, "learning_rate": 0.0002, "epoch": 5.2511917858452515, "step": 7160}, {"loss": 0.5092, "grad_norm": 1.4432005882263184, "learning_rate": 0.0002, "epoch": 5.258525852585258, "step": 7170}, {"loss": 0.5133, "grad_norm": 1.1239633560180664, "learning_rate": 0.0002, "epoch": 5.2658599193252655, "step": 7180}, {"loss": 0.4969, "grad_norm": 0.7012821435928345, "learning_rate": 0.0002, "epoch": 5.273193986065273, "step": 7190}, {"loss": 0.5466, "grad_norm": 1.3499128818511963, "learning_rate": 0.0002, "epoch": 5.2805280528052805, "step": 7200}, {"loss": 0.5282, "grad_norm": 0.9498730897903442, "learning_rate": 0.0002, "epoch": 5.287862119545288, "step": 7210}, {"loss": 0.5051, "grad_norm": 0.9552369117736816, "learning_rate": 0.0002, "epoch": 5.295196186285295, "step": 7220}, {"loss": 0.5329, "grad_norm": 0.7610348463058472, "learning_rate": 0.0002, "epoch": 5.302530253025303, "step": 7230}, {"loss": 0.468, "grad_norm": 1.0314512252807617, "learning_rate": 0.0002, "epoch": 5.3098643197653095, "step": 7240}, {"loss": 0.5367, "grad_norm": 1.0534334182739258, "learning_rate": 0.0002, "epoch": 5.317198386505317, "step": 7250}, {"loss": 0.5491, "grad_norm": 1.2553406953811646, "learning_rate": 0.0002, "epoch": 5.324532453245324, "step": 7260}, {"loss": 0.5218, "grad_norm": 0.7061691880226135, "learning_rate": 0.0002, "epoch": 5.331866519985332, "step": 7270}, {"loss": 0.5625, "grad_norm": 0.9652578830718994, "learning_rate": 0.0002, "epoch": 5.339200586725339, "step": 7280}, {"loss": 0.5608, "grad_norm": 1.114788293838501, "learning_rate": 0.0002, "epoch": 5.346534653465347, "step": 7290}, {"loss": 0.578, "grad_norm": 1.0940049886703491, "learning_rate": 0.0002, "epoch": 5.353868720205353, "step": 7300}, {"loss": 0.5256, "grad_norm": 1.0151008367538452, "learning_rate": 0.0002, "epoch": 5.361202786945361, "step": 7310}, {"loss": 0.5377, "grad_norm": 1.0369552373886108, "learning_rate": 0.0002, "epoch": 5.368536853685368, "step": 7320}, {"loss": 0.5028, "grad_norm": 0.8489866256713867, "learning_rate": 0.0002, "epoch": 5.375870920425376, "step": 7330}, {"loss": 0.5937, "grad_norm": 1.1031713485717773, "learning_rate": 0.0002, "epoch": 5.383204987165383, "step": 7340}, {"loss": 0.5355, "grad_norm": 0.9094716310501099, "learning_rate": 0.0002, "epoch": 5.390539053905391, "step": 7350}, {"loss": 0.5406, "grad_norm": 0.9530431032180786, "learning_rate": 0.0002, "epoch": 5.397873120645398, "step": 7360}, {"loss": 0.529, "grad_norm": 0.9633604884147644, "learning_rate": 0.0002, "epoch": 5.405207187385405, "step": 7370}, {"loss": 0.5315, "grad_norm": 0.9541662335395813, "learning_rate": 0.0002, "epoch": 5.412541254125412, "step": 7380}, {"loss": 0.6774, "grad_norm": 1.0459771156311035, "learning_rate": 0.0002, "epoch": 5.41987532086542, "step": 7390}, {"loss": 0.5737, "grad_norm": 1.027388334274292, "learning_rate": 0.0002, "epoch": 5.427209387605427, "step": 7400}, {"loss": 0.556, "grad_norm": 0.7267653346061707, "learning_rate": 0.0002, "epoch": 5.434543454345435, "step": 7410}, {"loss": 0.4581, "grad_norm": 1.020142674446106, "learning_rate": 0.0002, "epoch": 5.441877521085442, "step": 7420}, {"loss": 0.4853, "grad_norm": 1.044754147529602, "learning_rate": 0.0002, "epoch": 5.449211587825449, "step": 7430}, {"loss": 0.5666, "grad_norm": 1.5476195812225342, "learning_rate": 0.0002, "epoch": 5.456545654565456, "step": 7440}, {"loss": 0.5302, "grad_norm": 0.9879506826400757, "learning_rate": 0.0002, "epoch": 5.463879721305464, "step": 7450}, {"loss": 0.591, "grad_norm": 1.2562980651855469, "learning_rate": 0.0002, "epoch": 5.471213788045471, "step": 7460}, {"loss": 0.5188, "grad_norm": 1.3051384687423706, "learning_rate": 0.0002, "epoch": 5.478547854785479, "step": 7470}, {"loss": 0.5658, "grad_norm": 1.0511597394943237, "learning_rate": 0.0002, "epoch": 5.485881921525486, "step": 7480}, {"loss": 0.6327, "grad_norm": 1.0380817651748657, "learning_rate": 0.0002, "epoch": 5.493215988265494, "step": 7490}, {"loss": 0.5356, "grad_norm": 1.170274257659912, "learning_rate": 0.0002, "epoch": 5.5005500550055, "step": 7500}, {"loss": 0.5405, "grad_norm": 1.3356517553329468, "learning_rate": 0.0002, "epoch": 5.507884121745508, "step": 7510}, {"loss": 0.5305, "grad_norm": 1.0727124214172363, "learning_rate": 0.0002, "epoch": 5.515218188485515, "step": 7520}, {"loss": 0.5543, "grad_norm": 1.0110199451446533, "learning_rate": 0.0002, "epoch": 5.522552255225523, "step": 7530}, {"loss": 0.5962, "grad_norm": 1.3086743354797363, "learning_rate": 0.0002, "epoch": 5.52988632196553, "step": 7540}, {"loss": 0.5512, "grad_norm": 1.1904916763305664, "learning_rate": 0.0002, "epoch": 5.537220388705538, "step": 7550}, {"loss": 0.5915, "grad_norm": 0.9466280937194824, "learning_rate": 0.0002, "epoch": 5.544554455445544, "step": 7560}, {"loss": 0.5573, "grad_norm": 1.1237901449203491, "learning_rate": 0.0002, "epoch": 5.551888522185552, "step": 7570}, {"loss": 0.5383, "grad_norm": 0.9590660333633423, "learning_rate": 0.0002, "epoch": 5.559222588925559, "step": 7580}, {"loss": 0.5594, "grad_norm": 1.0890778303146362, "learning_rate": 0.0002, "epoch": 5.566556655665567, "step": 7590}, {"loss": 0.5698, "grad_norm": 0.7206931114196777, "learning_rate": 0.0002, "epoch": 5.573890722405574, "step": 7600}, {"loss": 0.5511, "grad_norm": 1.2884514331817627, "learning_rate": 0.0002, "epoch": 5.5812247891455815, "step": 7610}, {"loss": 0.5279, "grad_norm": 0.7798039317131042, "learning_rate": 0.0002, "epoch": 5.588558855885589, "step": 7620}, {"loss": 0.4847, "grad_norm": 1.166046142578125, "learning_rate": 0.0002, "epoch": 5.595892922625596, "step": 7630}, {"loss": 0.5821, "grad_norm": 1.0150201320648193, "learning_rate": 0.0002, "epoch": 5.603226989365603, "step": 7640}, {"loss": 0.5296, "grad_norm": 1.0449682474136353, "learning_rate": 0.0002, "epoch": 5.6105610561056105, "step": 7650}, {"loss": 0.5431, "grad_norm": 0.9310530424118042, "learning_rate": 0.0002, "epoch": 5.617895122845618, "step": 7660}, {"loss": 0.5234, "grad_norm": 0.9117933511734009, "learning_rate": 0.0002, "epoch": 5.6252291895856255, "step": 7670}, {"loss": 0.5807, "grad_norm": 1.1475164890289307, "learning_rate": 0.0002, "epoch": 5.632563256325633, "step": 7680}, {"loss": 0.5816, "grad_norm": 1.066809058189392, "learning_rate": 0.0002, "epoch": 5.6398973230656395, "step": 7690}, {"loss": 0.551, "grad_norm": 1.2834991216659546, "learning_rate": 0.0002, "epoch": 5.647231389805647, "step": 7700}, {"loss": 0.5914, "grad_norm": 1.2245112657546997, "learning_rate": 0.0002, "epoch": 5.6545654565456545, "step": 7710}, {"loss": 0.5552, "grad_norm": 1.1424106359481812, "learning_rate": 0.0002, "epoch": 5.661899523285662, "step": 7720}, {"loss": 0.559, "grad_norm": 1.0673892498016357, "learning_rate": 0.0002, "epoch": 5.669233590025669, "step": 7730}, {"loss": 0.544, "grad_norm": 1.4312121868133545, "learning_rate": 0.0002, "epoch": 5.676567656765677, "step": 7740}, {"loss": 0.5576, "grad_norm": 0.9976982474327087, "learning_rate": 0.0002, "epoch": 5.683901723505684, "step": 7750}, {"loss": 0.4855, "grad_norm": 0.9464678168296814, "learning_rate": 0.0002, "epoch": 5.691235790245691, "step": 7760}, {"loss": 0.5363, "grad_norm": 1.010995626449585, "learning_rate": 0.0002, "epoch": 5.698569856985698, "step": 7770}, {"loss": 0.5873, "grad_norm": 1.3787750005722046, "learning_rate": 0.0002, "epoch": 5.705903923725706, "step": 7780}, {"loss": 0.6234, "grad_norm": 1.020922303199768, "learning_rate": 0.0002, "epoch": 5.713237990465713, "step": 7790}, {"loss": 0.5337, "grad_norm": 0.9748636484146118, "learning_rate": 0.0002, "epoch": 5.720572057205721, "step": 7800}, {"loss": 0.5507, "grad_norm": 1.3077744245529175, "learning_rate": 0.0002, "epoch": 5.727906123945728, "step": 7810}, {"loss": 0.558, "grad_norm": 1.4770057201385498, "learning_rate": 0.0002, "epoch": 5.735240190685735, "step": 7820}, {"loss": 0.5571, "grad_norm": 1.6349090337753296, "learning_rate": 0.0002, "epoch": 5.742574257425742, "step": 7830}, {"loss": 0.5056, "grad_norm": 0.9818630814552307, "learning_rate": 0.0002, "epoch": 5.74990832416575, "step": 7840}, {"loss": 0.5495, "grad_norm": 0.9659715890884399, "learning_rate": 0.0002, "epoch": 5.757242390905757, "step": 7850}, {"loss": 0.5628, "grad_norm": 0.9269950985908508, "learning_rate": 0.0002, "epoch": 5.764576457645765, "step": 7860}, {"loss": 0.5594, "grad_norm": 1.0099073648452759, "learning_rate": 0.0002, "epoch": 5.771910524385772, "step": 7870}, {"loss": 0.5912, "grad_norm": 0.9123615026473999, "learning_rate": 0.0002, "epoch": 5.77924459112578, "step": 7880}, {"loss": 0.6054, "grad_norm": 1.1542246341705322, "learning_rate": 0.0002, "epoch": 5.786578657865786, "step": 7890}, {"loss": 0.5829, "grad_norm": 1.0792022943496704, "learning_rate": 0.0002, "epoch": 5.793912724605794, "step": 7900}, {"loss": 0.504, "grad_norm": 0.95615553855896, "learning_rate": 0.0002, "epoch": 5.801246791345801, "step": 7910}, {"loss": 0.5918, "grad_norm": 1.2471332550048828, "learning_rate": 0.0002, "epoch": 5.808580858085809, "step": 7920}, {"loss": 0.5719, "grad_norm": 1.0189851522445679, "learning_rate": 0.0002, "epoch": 5.815914924825816, "step": 7930}, {"loss": 0.5958, "grad_norm": 1.3309742212295532, "learning_rate": 0.0002, "epoch": 5.823248991565823, "step": 7940}, {"loss": 0.6255, "grad_norm": 1.2930549383163452, "learning_rate": 0.0002, "epoch": 5.83058305830583, "step": 7950}, {"loss": 0.5301, "grad_norm": 0.8216308951377869, "learning_rate": 0.0002, "epoch": 5.837917125045838, "step": 7960}, {"loss": 0.5397, "grad_norm": 1.1205775737762451, "learning_rate": 0.0002, "epoch": 5.845251191785845, "step": 7970}, {"loss": 0.5903, "grad_norm": 0.851298451423645, "learning_rate": 0.0002, "epoch": 5.852585258525853, "step": 7980}, {"loss": 0.5981, "grad_norm": 0.8797095417976379, "learning_rate": 0.0002, "epoch": 5.85991932526586, "step": 7990}, {"loss": 0.6106, "grad_norm": 1.5784614086151123, "learning_rate": 0.0002, "epoch": 5.867253392005868, "step": 8000}, {"loss": 0.5956, "grad_norm": 1.1531187295913696, "learning_rate": 0.0002, "epoch": 5.874587458745875, "step": 8010}, {"loss": 0.6289, "grad_norm": 1.2469146251678467, "learning_rate": 0.0002, "epoch": 5.881921525485882, "step": 8020}, {"loss": 0.5827, "grad_norm": 1.0784350633621216, "learning_rate": 0.0002, "epoch": 5.889255592225889, "step": 8030}, {"loss": 0.6339, "grad_norm": 1.1311599016189575, "learning_rate": 0.0002, "epoch": 5.896589658965897, "step": 8040}, {"loss": 0.5815, "grad_norm": 0.9654512405395508, "learning_rate": 0.0002, "epoch": 5.903923725705904, "step": 8050}, {"loss": 0.6198, "grad_norm": 1.3288270235061646, "learning_rate": 0.0002, "epoch": 5.9112577924459115, "step": 8060}, {"loss": 0.6515, "grad_norm": 1.12800931930542, "learning_rate": 0.0002, "epoch": 5.918591859185918, "step": 8070}, {"loss": 0.5684, "grad_norm": 0.9449917674064636, "learning_rate": 0.0002, "epoch": 5.925925925925926, "step": 8080}, {"loss": 0.6063, "grad_norm": 1.1532357931137085, "learning_rate": 0.0002, "epoch": 5.933259992665933, "step": 8090}, {"loss": 0.5318, "grad_norm": 1.2211151123046875, "learning_rate": 0.0002, "epoch": 5.9405940594059405, "step": 8100}, {"loss": 0.6512, "grad_norm": 1.3459105491638184, "learning_rate": 0.0002, "epoch": 5.947928126145948, "step": 8110}, {"loss": 0.5952, "grad_norm": 1.251999855041504, "learning_rate": 0.0002, "epoch": 5.9552621928859555, "step": 8120}, {"loss": 0.6203, "grad_norm": 1.5682506561279297, "learning_rate": 0.0002, "epoch": 5.962596259625963, "step": 8130}, {"loss": 0.6253, "grad_norm": 0.926075279712677, "learning_rate": 0.0002, "epoch": 5.9699303263659695, "step": 8140}, {"loss": 0.5545, "grad_norm": 0.9622511863708496, "learning_rate": 0.0002, "epoch": 5.977264393105977, "step": 8150}, {"loss": 0.5518, "grad_norm": 0.9633373618125916, "learning_rate": 0.0002, "epoch": 5.9845984598459845, "step": 8160}, {"loss": 0.5831, "grad_norm": 0.8960476517677307, "learning_rate": 0.0002, "epoch": 5.991932526585992, "step": 8170}, {"loss": 0.5442, "grad_norm": 0.9372805953025818, "learning_rate": 0.0002, "epoch": 5.999266593325999, "step": 8180}]} +{"epoch": 6.999633296662999, "step": 9544, "epoch_duration": 1554.9810886383057, "total_accumulated_duration": 10403.232823848724, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}, {"eval_loss": 1.1534006595611572, "eval_runtime": 32.7541, "eval_samples_per_second": 13.159, "eval_steps_per_second": 1.649, "epoch": 2.0, "step": 2727}, {"loss": 0.9624, "grad_norm": 0.48628315329551697, "learning_rate": 0.0002, "epoch": 2.002200220022002, "step": 2730}, {"loss": 0.9603, "grad_norm": 0.413875013589859, "learning_rate": 0.0002, "epoch": 2.0095342867620096, "step": 2740}, {"loss": 0.965, "grad_norm": 0.4988735616207123, "learning_rate": 0.0002, "epoch": 2.0168683535020167, "step": 2750}, {"loss": 0.9677, "grad_norm": 0.5634812712669373, "learning_rate": 0.0002, "epoch": 2.024202420242024, "step": 2760}, {"loss": 0.9547, "grad_norm": 0.48302653431892395, "learning_rate": 0.0002, "epoch": 2.0315364869820316, "step": 2770}, {"loss": 0.9346, "grad_norm": 0.49914175271987915, "learning_rate": 0.0002, "epoch": 2.038870553722039, "step": 2780}, {"loss": 0.904, "grad_norm": 1.14039945602417, "learning_rate": 0.0002, "epoch": 2.046204620462046, "step": 2790}, {"loss": 0.9588, "grad_norm": 0.6359720826148987, "learning_rate": 0.0002, "epoch": 2.0535386872020536, "step": 2800}, {"loss": 0.9031, "grad_norm": 0.4589158296585083, "learning_rate": 0.0002, "epoch": 2.060872753942061, "step": 2810}, {"loss": 0.9438, "grad_norm": 0.46255481243133545, "learning_rate": 0.0002, "epoch": 2.068206820682068, "step": 2820}, {"loss": 0.9464, "grad_norm": 0.6232137680053711, "learning_rate": 0.0002, "epoch": 2.0755408874220755, "step": 2830}, {"loss": 0.8978, "grad_norm": 0.41042178869247437, "learning_rate": 0.0002, "epoch": 2.082874954162083, "step": 2840}, {"loss": 0.8516, "grad_norm": 0.5334428548812866, "learning_rate": 0.0002, "epoch": 2.09020902090209, "step": 2850}, {"loss": 0.9313, "grad_norm": 0.8270058631896973, "learning_rate": 0.0002, "epoch": 2.0975430876420975, "step": 2860}, {"loss": 1.0064, "grad_norm": 0.6624533534049988, "learning_rate": 0.0002, "epoch": 2.104877154382105, "step": 2870}, {"loss": 0.9196, "grad_norm": 0.5448863506317139, "learning_rate": 0.0002, "epoch": 2.112211221122112, "step": 2880}, {"loss": 0.887, "grad_norm": 0.621482789516449, "learning_rate": 0.0002, "epoch": 2.1195452878621195, "step": 2890}, {"loss": 0.9702, "grad_norm": 0.4556255340576172, "learning_rate": 0.0002, "epoch": 2.126879354602127, "step": 2900}, {"loss": 0.9323, "grad_norm": 0.4620579183101654, "learning_rate": 0.0002, "epoch": 2.1342134213421344, "step": 2910}, {"loss": 0.836, "grad_norm": 0.9602415561676025, "learning_rate": 0.0002, "epoch": 2.1415474880821415, "step": 2920}, {"loss": 0.8826, "grad_norm": 0.587943971157074, "learning_rate": 0.0002, "epoch": 2.148881554822149, "step": 2930}, {"loss": 0.971, "grad_norm": 0.5121372938156128, "learning_rate": 0.0002, "epoch": 2.1562156215621564, "step": 2940}, {"loss": 0.8751, "grad_norm": 0.49424484372138977, "learning_rate": 0.0002, "epoch": 2.1635496883021634, "step": 2950}, {"loss": 0.8674, "grad_norm": 0.6312560439109802, "learning_rate": 0.0002, "epoch": 2.170883755042171, "step": 2960}, {"loss": 0.9791, "grad_norm": 0.5235576629638672, "learning_rate": 0.0002, "epoch": 2.1782178217821784, "step": 2970}, {"loss": 0.9706, "grad_norm": 0.5868439674377441, "learning_rate": 0.0002, "epoch": 2.1855518885221854, "step": 2980}, {"loss": 0.9338, "grad_norm": 0.42302873730659485, "learning_rate": 0.0002, "epoch": 2.192885955262193, "step": 2990}, {"loss": 0.9332, "grad_norm": 0.5097725987434387, "learning_rate": 0.0002, "epoch": 2.2002200220022003, "step": 3000}, {"loss": 0.9239, "grad_norm": 0.5091572403907776, "learning_rate": 0.0002, "epoch": 2.2075540887422074, "step": 3010}, {"loss": 0.8898, "grad_norm": 0.49433162808418274, "learning_rate": 0.0002, "epoch": 2.214888155482215, "step": 3020}, {"loss": 0.9734, "grad_norm": 0.5577368140220642, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3030}, {"loss": 0.9033, "grad_norm": 0.6177583932876587, "learning_rate": 0.0002, "epoch": 2.2295562889622293, "step": 3040}, {"loss": 0.9882, "grad_norm": 0.5256719589233398, "learning_rate": 0.0002, "epoch": 2.236890355702237, "step": 3050}, {"loss": 0.9439, "grad_norm": 0.5001118183135986, "learning_rate": 0.0002, "epoch": 2.2442244224422443, "step": 3060}, {"loss": 0.8718, "grad_norm": 0.5721249580383301, "learning_rate": 0.0002, "epoch": 2.2515584891822513, "step": 3070}, {"loss": 1.0648, "grad_norm": 0.5325384140014648, "learning_rate": 0.0002, "epoch": 2.258892555922259, "step": 3080}, {"loss": 0.9843, "grad_norm": 0.5719189047813416, "learning_rate": 0.0002, "epoch": 2.2662266226622663, "step": 3090}, {"loss": 0.8633, "grad_norm": 0.6337835788726807, "learning_rate": 0.0002, "epoch": 2.2735606894022737, "step": 3100}, {"loss": 0.9962, "grad_norm": 0.5381836891174316, "learning_rate": 0.0002, "epoch": 2.2808947561422808, "step": 3110}, {"loss": 0.8265, "grad_norm": 0.5408531427383423, "learning_rate": 0.0002, "epoch": 2.2882288228822882, "step": 3120}, {"loss": 1.0325, "grad_norm": 0.43705281615257263, "learning_rate": 0.0002, "epoch": 2.2955628896222957, "step": 3130}, {"loss": 0.9388, "grad_norm": 0.6454030275344849, "learning_rate": 0.0002, "epoch": 2.3028969563623027, "step": 3140}, {"loss": 0.954, "grad_norm": 0.686030387878418, "learning_rate": 0.0002, "epoch": 2.31023102310231, "step": 3150}, {"loss": 0.9403, "grad_norm": 0.5123633146286011, "learning_rate": 0.0002, "epoch": 2.3175650898423177, "step": 3160}, {"loss": 0.8834, "grad_norm": 0.842506468296051, "learning_rate": 0.0002, "epoch": 2.3248991565823247, "step": 3170}, {"loss": 1.0497, "grad_norm": 0.5193818807601929, "learning_rate": 0.0002, "epoch": 2.332233223322332, "step": 3180}, {"loss": 0.9473, "grad_norm": 0.5634409189224243, "learning_rate": 0.0002, "epoch": 2.3395672900623397, "step": 3190}, {"loss": 0.8499, "grad_norm": 0.6475534439086914, "learning_rate": 0.0002, "epoch": 2.3469013568023467, "step": 3200}, {"loss": 0.874, "grad_norm": 1.1503914594650269, "learning_rate": 0.0002, "epoch": 2.354235423542354, "step": 3210}, {"loss": 0.9762, "grad_norm": 0.7234905362129211, "learning_rate": 0.0002, "epoch": 2.3615694902823616, "step": 3220}, {"loss": 0.9007, "grad_norm": 0.664903461933136, "learning_rate": 0.0002, "epoch": 2.368903557022369, "step": 3230}, {"loss": 0.9987, "grad_norm": 0.5453006625175476, "learning_rate": 0.0002, "epoch": 2.376237623762376, "step": 3240}, {"loss": 0.9742, "grad_norm": 0.6256654262542725, "learning_rate": 0.0002, "epoch": 2.3835716905023836, "step": 3250}, {"loss": 0.9922, "grad_norm": 0.5166565179824829, "learning_rate": 0.0002, "epoch": 2.390905757242391, "step": 3260}, {"loss": 0.927, "grad_norm": 0.5699098110198975, "learning_rate": 0.0002, "epoch": 2.398239823982398, "step": 3270}, {"loss": 0.8878, "grad_norm": 0.4472540020942688, "learning_rate": 0.0002, "epoch": 2.4055738907224056, "step": 3280}, {"loss": 0.9439, "grad_norm": 0.6790403127670288, "learning_rate": 0.0002, "epoch": 2.412907957462413, "step": 3290}, {"loss": 0.972, "grad_norm": 0.5182185173034668, "learning_rate": 0.0002, "epoch": 2.42024202420242, "step": 3300}, {"loss": 0.9775, "grad_norm": 0.564647912979126, "learning_rate": 0.0002, "epoch": 2.4275760909424275, "step": 3310}, {"loss": 1.072, "grad_norm": 0.5625313520431519, "learning_rate": 0.0002, "epoch": 2.434910157682435, "step": 3320}, {"loss": 0.8798, "grad_norm": 0.7496559619903564, "learning_rate": 0.0002, "epoch": 2.442244224422442, "step": 3330}, {"loss": 0.868, "grad_norm": 0.4779128134250641, "learning_rate": 0.0002, "epoch": 2.4495782911624495, "step": 3340}, {"loss": 1.0316, "grad_norm": 0.578093409538269, "learning_rate": 0.0002, "epoch": 2.456912357902457, "step": 3350}, {"loss": 0.9282, "grad_norm": 0.5456080436706543, "learning_rate": 0.0002, "epoch": 2.4642464246424645, "step": 3360}, {"loss": 0.8409, "grad_norm": 0.4769273102283478, "learning_rate": 0.0002, "epoch": 2.4715804913824715, "step": 3370}, {"loss": 0.9312, "grad_norm": 0.5608189702033997, "learning_rate": 0.0002, "epoch": 2.478914558122479, "step": 3380}, {"loss": 0.9934, "grad_norm": 0.5590165853500366, "learning_rate": 0.0002, "epoch": 2.4862486248624864, "step": 3390}, {"loss": 1.025, "grad_norm": 0.801306962966919, "learning_rate": 0.0002, "epoch": 2.4935826916024935, "step": 3400}, {"loss": 0.9049, "grad_norm": 0.6045624613761902, "learning_rate": 0.0002, "epoch": 2.500916758342501, "step": 3410}, {"loss": 0.944, "grad_norm": 0.5735858082771301, "learning_rate": 0.0002, "epoch": 2.5082508250825084, "step": 3420}, {"loss": 0.9846, "grad_norm": 0.6827309131622314, "learning_rate": 0.0002, "epoch": 2.5155848918225154, "step": 3430}, {"loss": 0.9789, "grad_norm": 0.5702602863311768, "learning_rate": 0.0002, "epoch": 2.522918958562523, "step": 3440}, {"loss": 0.9127, "grad_norm": 0.6674721240997314, "learning_rate": 0.0002, "epoch": 2.5302530253025304, "step": 3450}, {"loss": 0.914, "grad_norm": 0.5635907649993896, "learning_rate": 0.0002, "epoch": 2.5375870920425374, "step": 3460}, {"loss": 0.8398, "grad_norm": 0.42737770080566406, "learning_rate": 0.0002, "epoch": 2.544921158782545, "step": 3470}, {"loss": 0.9474, "grad_norm": 0.6720691919326782, "learning_rate": 0.0002, "epoch": 2.5522552255225524, "step": 3480}, {"loss": 0.8637, "grad_norm": 0.8917084336280823, "learning_rate": 0.0002, "epoch": 2.55958929226256, "step": 3490}, {"loss": 0.9257, "grad_norm": 0.5134549140930176, "learning_rate": 0.0002, "epoch": 2.566923359002567, "step": 3500}, {"loss": 0.9362, "grad_norm": 0.4951367974281311, "learning_rate": 0.0002, "epoch": 2.5742574257425743, "step": 3510}, {"loss": 0.9184, "grad_norm": 0.9438204765319824, "learning_rate": 0.0002, "epoch": 2.5815914924825814, "step": 3520}, {"loss": 0.8939, "grad_norm": 0.6024714708328247, "learning_rate": 0.0002, "epoch": 2.588925559222589, "step": 3530}, {"loss": 0.9298, "grad_norm": 0.5248535871505737, "learning_rate": 0.0002, "epoch": 2.5962596259625963, "step": 3540}, {"loss": 0.941, "grad_norm": 0.8677568435668945, "learning_rate": 0.0002, "epoch": 2.6035936927026038, "step": 3550}, {"loss": 0.9253, "grad_norm": 0.82008296251297, "learning_rate": 0.0002, "epoch": 2.610927759442611, "step": 3560}, {"loss": 0.8429, "grad_norm": 0.4724634885787964, "learning_rate": 0.0002, "epoch": 2.6182618261826183, "step": 3570}, {"loss": 0.9058, "grad_norm": 0.5434244275093079, "learning_rate": 0.0002, "epoch": 2.6255958929226257, "step": 3580}, {"loss": 0.9379, "grad_norm": 0.4948740005493164, "learning_rate": 0.0002, "epoch": 2.6329299596626328, "step": 3590}, {"loss": 0.8718, "grad_norm": 0.42109328508377075, "learning_rate": 0.0002, "epoch": 2.6402640264026402, "step": 3600}, {"loss": 0.9809, "grad_norm": 0.7979786396026611, "learning_rate": 0.0002, "epoch": 2.6475980931426477, "step": 3610}, {"loss": 0.9229, "grad_norm": 0.6345919370651245, "learning_rate": 0.0002, "epoch": 2.654932159882655, "step": 3620}, {"loss": 0.8506, "grad_norm": 0.4971671402454376, "learning_rate": 0.0002, "epoch": 2.662266226622662, "step": 3630}, {"loss": 0.8054, "grad_norm": 0.6467748284339905, "learning_rate": 0.0002, "epoch": 2.6696002933626697, "step": 3640}, {"loss": 0.9277, "grad_norm": 0.4240160286426544, "learning_rate": 0.0002, "epoch": 2.6769343601026767, "step": 3650}, {"loss": 0.8213, "grad_norm": 0.5179754495620728, "learning_rate": 0.0002, "epoch": 2.684268426842684, "step": 3660}, {"loss": 0.9221, "grad_norm": 0.754012405872345, "learning_rate": 0.0002, "epoch": 2.6916024935826917, "step": 3670}, {"loss": 0.9194, "grad_norm": 0.5141299962997437, "learning_rate": 0.0002, "epoch": 2.698936560322699, "step": 3680}, {"loss": 0.9495, "grad_norm": 0.5737819075584412, "learning_rate": 0.0002, "epoch": 2.706270627062706, "step": 3690}, {"loss": 1.0162, "grad_norm": 0.5887577533721924, "learning_rate": 0.0002, "epoch": 2.7136046938027136, "step": 3700}, {"loss": 0.9169, "grad_norm": 0.6740471720695496, "learning_rate": 0.0002, "epoch": 2.720938760542721, "step": 3710}, {"loss": 0.9297, "grad_norm": 0.5879453420639038, "learning_rate": 0.0002, "epoch": 2.728272827282728, "step": 3720}, {"loss": 0.9358, "grad_norm": 0.4858354926109314, "learning_rate": 0.0002, "epoch": 2.7356068940227356, "step": 3730}, {"loss": 0.9308, "grad_norm": 0.5489001870155334, "learning_rate": 0.0002, "epoch": 2.742940960762743, "step": 3740}, {"loss": 0.894, "grad_norm": 0.8187092542648315, "learning_rate": 0.0002, "epoch": 2.7502750275027505, "step": 3750}, {"loss": 0.8954, "grad_norm": 0.5666626691818237, "learning_rate": 0.0002, "epoch": 2.7576090942427576, "step": 3760}, {"loss": 1.0059, "grad_norm": 0.5377066135406494, "learning_rate": 0.0002, "epoch": 2.764943160982765, "step": 3770}, {"loss": 0.9132, "grad_norm": 0.566330075263977, "learning_rate": 0.0002, "epoch": 2.772277227722772, "step": 3780}, {"loss": 0.9415, "grad_norm": 0.5522832870483398, "learning_rate": 0.0002, "epoch": 2.7796112944627795, "step": 3790}, {"loss": 0.8816, "grad_norm": 0.5668695569038391, "learning_rate": 0.0002, "epoch": 2.786945361202787, "step": 3800}, {"loss": 0.8885, "grad_norm": 0.7566602826118469, "learning_rate": 0.0002, "epoch": 2.7942794279427945, "step": 3810}, {"loss": 0.8598, "grad_norm": 0.5603684782981873, "learning_rate": 0.0002, "epoch": 2.8016134946828015, "step": 3820}, {"loss": 0.9602, "grad_norm": 0.49122217297554016, "learning_rate": 0.0002, "epoch": 2.808947561422809, "step": 3830}, {"loss": 0.9738, "grad_norm": 0.6798251867294312, "learning_rate": 0.0002, "epoch": 2.816281628162816, "step": 3840}, {"loss": 0.9533, "grad_norm": 0.6097991466522217, "learning_rate": 0.0002, "epoch": 2.8236156949028235, "step": 3850}, {"loss": 0.8672, "grad_norm": 0.6675726175308228, "learning_rate": 0.0002, "epoch": 2.830949761642831, "step": 3860}, {"loss": 0.9324, "grad_norm": 0.9223952889442444, "learning_rate": 0.0002, "epoch": 2.8382838283828384, "step": 3870}, {"loss": 0.8767, "grad_norm": 0.6020799875259399, "learning_rate": 0.0002, "epoch": 2.8456178951228455, "step": 3880}, {"loss": 0.9148, "grad_norm": 0.5206381678581238, "learning_rate": 0.0002, "epoch": 2.852951961862853, "step": 3890}, {"loss": 0.9479, "grad_norm": 0.6268777251243591, "learning_rate": 0.0002, "epoch": 2.8602860286028604, "step": 3900}, {"loss": 0.9409, "grad_norm": 1.1583497524261475, "learning_rate": 0.0002, "epoch": 2.8676200953428674, "step": 3910}, {"loss": 0.895, "grad_norm": 0.7263903021812439, "learning_rate": 0.0002, "epoch": 2.874954162082875, "step": 3920}, {"loss": 0.8786, "grad_norm": 0.5369910001754761, "learning_rate": 0.0002, "epoch": 2.8822882288228824, "step": 3930}, {"loss": 1.0015, "grad_norm": 0.7298350930213928, "learning_rate": 0.0002, "epoch": 2.88962229556289, "step": 3940}, {"loss": 0.979, "grad_norm": 0.577012836933136, "learning_rate": 0.0002, "epoch": 2.896956362302897, "step": 3950}, {"loss": 0.9716, "grad_norm": 0.5859594345092773, "learning_rate": 0.0002, "epoch": 2.9042904290429044, "step": 3960}, {"loss": 0.8772, "grad_norm": 0.47176122665405273, "learning_rate": 0.0002, "epoch": 2.9116244957829114, "step": 3970}, {"loss": 0.8997, "grad_norm": 0.9699620604515076, "learning_rate": 0.0002, "epoch": 2.918958562522919, "step": 3980}, {"loss": 0.9057, "grad_norm": 0.7908747792243958, "learning_rate": 0.0002, "epoch": 2.9262926292629263, "step": 3990}, {"loss": 0.9462, "grad_norm": 0.5777379274368286, "learning_rate": 0.0002, "epoch": 2.933626696002934, "step": 4000}, {"loss": 0.9358, "grad_norm": 0.599288284778595, "learning_rate": 0.0002, "epoch": 2.940960762742941, "step": 4010}, {"loss": 0.9812, "grad_norm": 0.5232274532318115, "learning_rate": 0.0002, "epoch": 2.9482948294829483, "step": 4020}, {"loss": 0.96, "grad_norm": 0.6395137310028076, "learning_rate": 0.0002, "epoch": 2.9556288962229558, "step": 4030}, {"loss": 0.9813, "grad_norm": 0.589260458946228, "learning_rate": 0.0002, "epoch": 2.962962962962963, "step": 4040}, {"loss": 0.9541, "grad_norm": 0.5699581503868103, "learning_rate": 0.0002, "epoch": 2.9702970297029703, "step": 4050}, {"loss": 0.9585, "grad_norm": 0.528468132019043, "learning_rate": 0.0002, "epoch": 2.9776310964429777, "step": 4060}, {"loss": 0.9164, "grad_norm": 0.4804670512676239, "learning_rate": 0.0002, "epoch": 2.984965163182985, "step": 4070}, {"loss": 0.9771, "grad_norm": 1.1918889284133911, "learning_rate": 0.0002, "epoch": 2.9922992299229922, "step": 4080}, {"loss": 0.9178, "grad_norm": 0.5479103326797485, "learning_rate": 0.0002, "epoch": 2.9996332966629997, "step": 4090}, {"eval_loss": 1.1642853021621704, "eval_runtime": 32.7511, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.649, "epoch": 2.9996332966629997, "step": 4090}, {"loss": 0.7981, "grad_norm": 0.7430027723312378, "learning_rate": 0.0002, "epoch": 3.006967363403007, "step": 4100}, {"loss": 0.7871, "grad_norm": 0.6293647289276123, "learning_rate": 0.0002, "epoch": 3.014301430143014, "step": 4110}, {"loss": 0.78, "grad_norm": 0.6191329956054688, "learning_rate": 0.0002, "epoch": 3.0216354968830217, "step": 4120}, {"loss": 0.7618, "grad_norm": 0.7959313988685608, "learning_rate": 0.0002, "epoch": 3.028969563623029, "step": 4130}, {"loss": 0.8039, "grad_norm": 0.5956351161003113, "learning_rate": 0.0002, "epoch": 3.036303630363036, "step": 4140}, {"loss": 0.7477, "grad_norm": 0.670383632183075, "learning_rate": 0.0002, "epoch": 3.0436376971030437, "step": 4150}, {"loss": 0.7984, "grad_norm": 0.6414518356323242, "learning_rate": 0.0002, "epoch": 3.050971763843051, "step": 4160}, {"loss": 0.7369, "grad_norm": 0.7928852438926697, "learning_rate": 0.0002, "epoch": 3.058305830583058, "step": 4170}, {"loss": 0.7914, "grad_norm": 0.6211121082305908, "learning_rate": 0.0002, "epoch": 3.0656398973230656, "step": 4180}, {"loss": 0.7365, "grad_norm": 0.6237057447433472, "learning_rate": 0.0002, "epoch": 3.072973964063073, "step": 4190}, {"loss": 0.702, "grad_norm": 0.6522233486175537, "learning_rate": 0.0002, "epoch": 3.08030803080308, "step": 4200}, {"loss": 0.7646, "grad_norm": 0.9396848678588867, "learning_rate": 0.0002, "epoch": 3.0876420975430876, "step": 4210}, {"loss": 0.7559, "grad_norm": 0.8003010749816895, "learning_rate": 0.0002, "epoch": 3.094976164283095, "step": 4220}, {"loss": 0.711, "grad_norm": 0.6733810305595398, "learning_rate": 0.0002, "epoch": 3.102310231023102, "step": 4230}, {"loss": 0.696, "grad_norm": 0.6365828514099121, "learning_rate": 0.0002, "epoch": 3.1096442977631096, "step": 4240}, {"loss": 0.8362, "grad_norm": 1.0805548429489136, "learning_rate": 0.0002, "epoch": 3.116978364503117, "step": 4250}, {"loss": 0.7651, "grad_norm": 0.7262141108512878, "learning_rate": 0.0002, "epoch": 3.1243124312431245, "step": 4260}, {"loss": 0.7304, "grad_norm": 0.5500539541244507, "learning_rate": 0.0002, "epoch": 3.1316464979831315, "step": 4270}, {"loss": 0.7721, "grad_norm": 0.793912947177887, "learning_rate": 0.0002, "epoch": 3.138980564723139, "step": 4280}, {"loss": 0.7708, "grad_norm": 1.2540518045425415, "learning_rate": 0.0002, "epoch": 3.1463146314631465, "step": 4290}, {"loss": 0.782, "grad_norm": 0.7020077705383301, "learning_rate": 0.0002, "epoch": 3.1536486982031535, "step": 4300}, {"loss": 0.7253, "grad_norm": 0.5111123323440552, "learning_rate": 0.0002, "epoch": 3.160982764943161, "step": 4310}, {"loss": 0.8159, "grad_norm": 0.7172090411186218, "learning_rate": 0.0002, "epoch": 3.1683168316831685, "step": 4320}, {"loss": 0.6962, "grad_norm": 0.6343168616294861, "learning_rate": 0.0002, "epoch": 3.1756508984231755, "step": 4330}, {"loss": 0.7938, "grad_norm": 0.9563672542572021, "learning_rate": 0.0002, "epoch": 3.182984965163183, "step": 4340}, {"loss": 0.7385, "grad_norm": 1.0225574970245361, "learning_rate": 0.0002, "epoch": 3.1903190319031904, "step": 4350}, {"loss": 0.8652, "grad_norm": 1.1633386611938477, "learning_rate": 0.0002, "epoch": 3.1976530986431975, "step": 4360}, {"loss": 0.7259, "grad_norm": 0.8915148973464966, "learning_rate": 0.0002, "epoch": 3.204987165383205, "step": 4370}, {"loss": 0.8061, "grad_norm": 0.9156812429428101, "learning_rate": 0.0002, "epoch": 3.2123212321232124, "step": 4380}, {"loss": 0.8189, "grad_norm": 0.6363258957862854, "learning_rate": 0.0002, "epoch": 3.21965529886322, "step": 4390}, {"loss": 0.7996, "grad_norm": 0.579099178314209, "learning_rate": 0.0002, "epoch": 3.226989365603227, "step": 4400}, {"loss": 0.8592, "grad_norm": 0.8778146505355835, "learning_rate": 0.0002, "epoch": 3.2343234323432344, "step": 4410}, {"loss": 0.8281, "grad_norm": 0.8356770873069763, "learning_rate": 0.0002, "epoch": 3.241657499083242, "step": 4420}, {"loss": 0.8484, "grad_norm": 0.702032208442688, "learning_rate": 0.0002, "epoch": 3.248991565823249, "step": 4430}, {"loss": 0.7227, "grad_norm": 0.6386539340019226, "learning_rate": 0.0002, "epoch": 3.2563256325632564, "step": 4440}, {"loss": 0.8374, "grad_norm": 0.7008408904075623, "learning_rate": 0.0002, "epoch": 3.263659699303264, "step": 4450}, {"loss": 0.7572, "grad_norm": 0.9556332230567932, "learning_rate": 0.0002, "epoch": 3.270993766043271, "step": 4460}, {"loss": 0.743, "grad_norm": 0.5667835474014282, "learning_rate": 0.0002, "epoch": 3.2783278327832783, "step": 4470}, {"loss": 0.8152, "grad_norm": 0.8239172697067261, "learning_rate": 0.0002, "epoch": 3.285661899523286, "step": 4480}, {"loss": 0.756, "grad_norm": 0.7045050859451294, "learning_rate": 0.0002, "epoch": 3.292995966263293, "step": 4490}, {"loss": 0.7655, "grad_norm": 0.7131434082984924, "learning_rate": 0.0002, "epoch": 3.3003300330033003, "step": 4500}, {"loss": 0.836, "grad_norm": 0.6924910545349121, "learning_rate": 0.0002, "epoch": 3.3076640997433078, "step": 4510}, {"loss": 0.736, "grad_norm": 0.8945356607437134, "learning_rate": 0.0002, "epoch": 3.3149981664833152, "step": 4520}, {"loss": 0.7575, "grad_norm": 0.6546903252601624, "learning_rate": 0.0002, "epoch": 3.3223322332233223, "step": 4530}, {"loss": 0.7893, "grad_norm": 0.8206679224967957, "learning_rate": 0.0002, "epoch": 3.3296662999633297, "step": 4540}, {"loss": 0.7502, "grad_norm": 0.6482203602790833, "learning_rate": 0.0002, "epoch": 3.3370003667033368, "step": 4550}, {"loss": 0.8172, "grad_norm": 0.7558760046958923, "learning_rate": 0.0002, "epoch": 3.3443344334433442, "step": 4560}, {"loss": 0.744, "grad_norm": 0.7794756889343262, "learning_rate": 0.0002, "epoch": 3.3516685001833517, "step": 4570}, {"loss": 0.7385, "grad_norm": 0.7382805943489075, "learning_rate": 0.0002, "epoch": 3.359002566923359, "step": 4580}, {"loss": 0.8511, "grad_norm": 0.5912511944770813, "learning_rate": 0.0002, "epoch": 3.366336633663366, "step": 4590}, {"loss": 0.8272, "grad_norm": 0.7444885969161987, "learning_rate": 0.0002, "epoch": 3.3736707004033737, "step": 4600}, {"loss": 0.7927, "grad_norm": 0.7354922890663147, "learning_rate": 0.0002, "epoch": 3.381004767143381, "step": 4610}, {"loss": 0.7183, "grad_norm": 0.7685934901237488, "learning_rate": 0.0002, "epoch": 3.388338833883388, "step": 4620}, {"loss": 0.7436, "grad_norm": 0.61041259765625, "learning_rate": 0.0002, "epoch": 3.3956729006233957, "step": 4630}, {"loss": 0.7661, "grad_norm": 0.6820451021194458, "learning_rate": 0.0002, "epoch": 3.403006967363403, "step": 4640}, {"loss": 0.8796, "grad_norm": 0.5819534063339233, "learning_rate": 0.0002, "epoch": 3.41034103410341, "step": 4650}, {"loss": 0.7314, "grad_norm": 0.705410897731781, "learning_rate": 0.0002, "epoch": 3.4176751008434176, "step": 4660}, {"loss": 0.7901, "grad_norm": 0.8052892088890076, "learning_rate": 0.0002, "epoch": 3.425009167583425, "step": 4670}, {"loss": 0.7298, "grad_norm": 0.7746483087539673, "learning_rate": 0.0002, "epoch": 3.432343234323432, "step": 4680}, {"loss": 0.7976, "grad_norm": 0.7713689804077148, "learning_rate": 0.0002, "epoch": 3.4396773010634396, "step": 4690}, {"loss": 0.7427, "grad_norm": 0.810371994972229, "learning_rate": 0.0002, "epoch": 3.447011367803447, "step": 4700}, {"loss": 0.7594, "grad_norm": 0.7702969312667847, "learning_rate": 0.0002, "epoch": 3.4543454345434546, "step": 4710}, {"loss": 0.7957, "grad_norm": 0.7069268822669983, "learning_rate": 0.0002, "epoch": 3.4616795012834616, "step": 4720}, {"loss": 0.8199, "grad_norm": 0.7640359401702881, "learning_rate": 0.0002, "epoch": 3.469013568023469, "step": 4730}, {"loss": 0.6875, "grad_norm": 0.8661707639694214, "learning_rate": 0.0002, "epoch": 3.4763476347634765, "step": 4740}, {"loss": 0.8528, "grad_norm": 0.9970282912254333, "learning_rate": 0.0002, "epoch": 3.4836817015034836, "step": 4750}, {"loss": 0.8462, "grad_norm": 0.5824355483055115, "learning_rate": 0.0002, "epoch": 3.491015768243491, "step": 4760}, {"loss": 0.851, "grad_norm": 1.3072649240493774, "learning_rate": 0.0002, "epoch": 3.4983498349834985, "step": 4770}, {"loss": 0.9101, "grad_norm": 0.873978316783905, "learning_rate": 0.0002, "epoch": 3.5056839017235055, "step": 4780}, {"loss": 0.7403, "grad_norm": 0.5526657104492188, "learning_rate": 0.0002, "epoch": 3.513017968463513, "step": 4790}, {"loss": 0.7921, "grad_norm": 0.790894627571106, "learning_rate": 0.0002, "epoch": 3.5203520352035205, "step": 4800}, {"loss": 0.831, "grad_norm": 0.8119630217552185, "learning_rate": 0.0002, "epoch": 3.5276861019435275, "step": 4810}, {"loss": 0.7351, "grad_norm": 0.633212149143219, "learning_rate": 0.0002, "epoch": 3.535020168683535, "step": 4820}, {"loss": 0.8505, "grad_norm": 0.703029990196228, "learning_rate": 0.0002, "epoch": 3.5423542354235424, "step": 4830}, {"loss": 0.7204, "grad_norm": 0.7603771686553955, "learning_rate": 0.0002, "epoch": 3.54968830216355, "step": 4840}, {"loss": 0.8868, "grad_norm": 0.6260480880737305, "learning_rate": 0.0002, "epoch": 3.557022368903557, "step": 4850}, {"loss": 0.8137, "grad_norm": 0.8203664422035217, "learning_rate": 0.0002, "epoch": 3.5643564356435644, "step": 4860}, {"loss": 0.8821, "grad_norm": 0.7793813347816467, "learning_rate": 0.0002, "epoch": 3.5716905023835714, "step": 4870}, {"loss": 0.8164, "grad_norm": 0.7667397260665894, "learning_rate": 0.0002, "epoch": 3.579024569123579, "step": 4880}, {"loss": 0.7597, "grad_norm": 0.8198829889297485, "learning_rate": 0.0002, "epoch": 3.5863586358635864, "step": 4890}, {"loss": 0.7027, "grad_norm": 0.7689233422279358, "learning_rate": 0.0002, "epoch": 3.593692702603594, "step": 4900}, {"loss": 0.804, "grad_norm": 0.7870983481407166, "learning_rate": 0.0002, "epoch": 3.601026769343601, "step": 4910}, {"loss": 0.8269, "grad_norm": 0.8133853077888489, "learning_rate": 0.0002, "epoch": 3.6083608360836084, "step": 4920}, {"loss": 0.8515, "grad_norm": 1.308401346206665, "learning_rate": 0.0002, "epoch": 3.615694902823616, "step": 4930}, {"loss": 0.8494, "grad_norm": 0.7131121754646301, "learning_rate": 0.0002, "epoch": 3.623028969563623, "step": 4940}, {"loss": 0.7235, "grad_norm": 0.6825910210609436, "learning_rate": 0.0002, "epoch": 3.6303630363036303, "step": 4950}, {"loss": 0.7824, "grad_norm": 0.7254678606987, "learning_rate": 0.0002, "epoch": 3.637697103043638, "step": 4960}, {"loss": 0.7983, "grad_norm": 0.8045085072517395, "learning_rate": 0.0002, "epoch": 3.6450311697836453, "step": 4970}, {"loss": 0.8223, "grad_norm": 0.6991777420043945, "learning_rate": 0.0002, "epoch": 3.6523652365236523, "step": 4980}, {"loss": 0.7806, "grad_norm": 0.7804713249206543, "learning_rate": 0.0002, "epoch": 3.6596993032636598, "step": 4990}, {"loss": 0.8402, "grad_norm": 0.8525708317756653, "learning_rate": 0.0002, "epoch": 3.667033370003667, "step": 5000}, {"loss": 0.8496, "grad_norm": 0.7959994673728943, "learning_rate": 0.0002, "epoch": 3.6743674367436743, "step": 5010}, {"loss": 0.8022, "grad_norm": 0.8103628158569336, "learning_rate": 0.0002, "epoch": 3.6817015034836817, "step": 5020}, {"loss": 0.7376, "grad_norm": 0.7517836093902588, "learning_rate": 0.0002, "epoch": 3.689035570223689, "step": 5030}, {"loss": 0.8375, "grad_norm": 0.6878514289855957, "learning_rate": 0.0002, "epoch": 3.6963696369636962, "step": 5040}, {"loss": 0.7998, "grad_norm": 1.2371820211410522, "learning_rate": 0.0002, "epoch": 3.7037037037037037, "step": 5050}, {"loss": 0.6941, "grad_norm": 0.6567103862762451, "learning_rate": 0.0002, "epoch": 3.711037770443711, "step": 5060}, {"loss": 0.8465, "grad_norm": 1.1254922151565552, "learning_rate": 0.0002, "epoch": 3.718371837183718, "step": 5070}, {"loss": 0.8365, "grad_norm": 0.6796132326126099, "learning_rate": 0.0002, "epoch": 3.7257059039237257, "step": 5080}, {"loss": 0.7818, "grad_norm": 0.7285300493240356, "learning_rate": 0.0002, "epoch": 3.733039970663733, "step": 5090}, {"loss": 0.8581, "grad_norm": 0.8931500911712646, "learning_rate": 0.0002, "epoch": 3.7403740374037406, "step": 5100}, {"loss": 0.8181, "grad_norm": 0.6256856918334961, "learning_rate": 0.0002, "epoch": 3.7477081041437477, "step": 5110}, {"loss": 0.743, "grad_norm": 0.79310142993927, "learning_rate": 0.0002, "epoch": 3.755042170883755, "step": 5120}, {"loss": 0.8235, "grad_norm": 0.6594041585922241, "learning_rate": 0.0002, "epoch": 3.762376237623762, "step": 5130}, {"loss": 0.6925, "grad_norm": 0.7029327750205994, "learning_rate": 0.0002, "epoch": 3.7697103043637696, "step": 5140}, {"loss": 0.7457, "grad_norm": 0.5880070328712463, "learning_rate": 0.0002, "epoch": 3.777044371103777, "step": 5150}, {"loss": 0.8716, "grad_norm": 0.7578945159912109, "learning_rate": 0.0002, "epoch": 3.7843784378437846, "step": 5160}, {"loss": 0.8819, "grad_norm": 0.8276378512382507, "learning_rate": 0.0002, "epoch": 3.7917125045837916, "step": 5170}, {"loss": 0.7559, "grad_norm": 0.7627953886985779, "learning_rate": 0.0002, "epoch": 3.799046571323799, "step": 5180}, {"loss": 0.7665, "grad_norm": 0.8169086575508118, "learning_rate": 0.0002, "epoch": 3.806380638063806, "step": 5190}, {"loss": 0.761, "grad_norm": 0.6605030298233032, "learning_rate": 0.0002, "epoch": 3.8137147048038136, "step": 5200}, {"loss": 0.8804, "grad_norm": 0.5837286114692688, "learning_rate": 0.0002, "epoch": 3.821048771543821, "step": 5210}, {"loss": 0.8369, "grad_norm": 1.2422157526016235, "learning_rate": 0.0002, "epoch": 3.8283828382838285, "step": 5220}, {"loss": 0.8431, "grad_norm": 0.6589220762252808, "learning_rate": 0.0002, "epoch": 3.8357169050238356, "step": 5230}, {"loss": 0.7686, "grad_norm": 0.8567556142807007, "learning_rate": 0.0002, "epoch": 3.843050971763843, "step": 5240}, {"loss": 0.8652, "grad_norm": 0.6490627527236938, "learning_rate": 0.0002, "epoch": 3.8503850385038505, "step": 5250}, {"loss": 0.7386, "grad_norm": 0.620232880115509, "learning_rate": 0.0002, "epoch": 3.8577191052438575, "step": 5260}, {"loss": 0.9192, "grad_norm": 0.7685128450393677, "learning_rate": 0.0002, "epoch": 3.865053171983865, "step": 5270}, {"loss": 0.872, "grad_norm": 0.8113296627998352, "learning_rate": 0.0002, "epoch": 3.8723872387238725, "step": 5280}, {"loss": 0.7156, "grad_norm": 0.8092675805091858, "learning_rate": 0.0002, "epoch": 3.87972130546388, "step": 5290}, {"loss": 0.7325, "grad_norm": 0.583570122718811, "learning_rate": 0.0002, "epoch": 3.887055372203887, "step": 5300}, {"loss": 0.9333, "grad_norm": 1.712363600730896, "learning_rate": 0.0002, "epoch": 3.8943894389438944, "step": 5310}, {"loss": 0.7537, "grad_norm": 0.6673534512519836, "learning_rate": 0.0002, "epoch": 3.9017235056839015, "step": 5320}, {"loss": 0.7035, "grad_norm": 1.9770312309265137, "learning_rate": 0.0002, "epoch": 3.909057572423909, "step": 5330}, {"loss": 0.8793, "grad_norm": 0.6430999636650085, "learning_rate": 0.0002, "epoch": 3.9163916391639164, "step": 5340}, {"loss": 0.839, "grad_norm": 1.0159571170806885, "learning_rate": 0.0002, "epoch": 3.923725705903924, "step": 5350}, {"loss": 0.9332, "grad_norm": 0.8607584834098816, "learning_rate": 0.0002, "epoch": 3.931059772643931, "step": 5360}, {"loss": 0.7261, "grad_norm": 0.6967900991439819, "learning_rate": 0.0002, "epoch": 3.9383938393839384, "step": 5370}, {"loss": 0.8456, "grad_norm": 0.7683077454566956, "learning_rate": 0.0002, "epoch": 3.945727906123946, "step": 5380}, {"loss": 0.7682, "grad_norm": 0.6805762648582458, "learning_rate": 0.0002, "epoch": 3.953061972863953, "step": 5390}, {"loss": 0.7746, "grad_norm": 0.7033619284629822, "learning_rate": 0.0002, "epoch": 3.9603960396039604, "step": 5400}, {"loss": 0.8393, "grad_norm": 0.966112494468689, "learning_rate": 0.0002, "epoch": 3.967730106343968, "step": 5410}, {"loss": 0.8316, "grad_norm": 0.8467881083488464, "learning_rate": 0.0002, "epoch": 3.9750641730839753, "step": 5420}, {"loss": 0.8084, "grad_norm": 0.8005317449569702, "learning_rate": 0.0002, "epoch": 3.9823982398239823, "step": 5430}, {"loss": 0.7168, "grad_norm": 1.1615241765975952, "learning_rate": 0.0002, "epoch": 3.98973230656399, "step": 5440}, {"loss": 0.8263, "grad_norm": 0.6121614575386047, "learning_rate": 0.0002, "epoch": 3.997066373303997, "step": 5450}, {"eval_loss": 1.1834222078323364, "eval_runtime": 32.7569, "eval_samples_per_second": 13.158, "eval_steps_per_second": 1.649, "epoch": 4.0, "step": 5454}, {"loss": 0.7267, "grad_norm": 0.6055727005004883, "learning_rate": 0.0002, "epoch": 4.004400440044004, "step": 5460}, {"loss": 0.5766, "grad_norm": 0.8232647180557251, "learning_rate": 0.0002, "epoch": 4.011734506784012, "step": 5470}, {"loss": 0.6489, "grad_norm": 0.7739192247390747, "learning_rate": 0.0002, "epoch": 4.019068573524019, "step": 5480}, {"loss": 0.5978, "grad_norm": 0.6264950633049011, "learning_rate": 0.0002, "epoch": 4.026402640264027, "step": 5490}, {"loss": 0.6392, "grad_norm": 1.4798702001571655, "learning_rate": 0.0002, "epoch": 4.033736707004033, "step": 5500}, {"loss": 0.6143, "grad_norm": 0.9538470506668091, "learning_rate": 0.0002, "epoch": 4.041070773744041, "step": 5510}, {"loss": 0.6056, "grad_norm": 0.834561288356781, "learning_rate": 0.0002, "epoch": 4.048404840484048, "step": 5520}, {"loss": 0.6077, "grad_norm": 0.6407850384712219, "learning_rate": 0.0002, "epoch": 4.055738907224056, "step": 5530}, {"loss": 0.6733, "grad_norm": 0.9035961627960205, "learning_rate": 0.0002, "epoch": 4.063072973964063, "step": 5540}, {"loss": 0.5854, "grad_norm": 0.842812716960907, "learning_rate": 0.0002, "epoch": 4.070407040704071, "step": 5550}, {"loss": 0.654, "grad_norm": 0.8197882175445557, "learning_rate": 0.0002, "epoch": 4.077741107444078, "step": 5560}, {"loss": 0.5919, "grad_norm": 0.8652673959732056, "learning_rate": 0.0002, "epoch": 4.085075174184085, "step": 5570}, {"loss": 0.6188, "grad_norm": 0.8048318028450012, "learning_rate": 0.0002, "epoch": 4.092409240924092, "step": 5580}, {"loss": 0.6487, "grad_norm": 0.9604969024658203, "learning_rate": 0.0002, "epoch": 4.0997433076641, "step": 5590}, {"loss": 0.6356, "grad_norm": 1.244756817817688, "learning_rate": 0.0002, "epoch": 4.107077374404107, "step": 5600}, {"loss": 0.6489, "grad_norm": 0.7975269556045532, "learning_rate": 0.0002, "epoch": 4.114411441144115, "step": 5610}, {"loss": 0.6445, "grad_norm": 0.6130099296569824, "learning_rate": 0.0002, "epoch": 4.121745507884122, "step": 5620}, {"loss": 0.6024, "grad_norm": 0.7793202996253967, "learning_rate": 0.0002, "epoch": 4.129079574624129, "step": 5630}, {"loss": 0.5723, "grad_norm": 1.187238335609436, "learning_rate": 0.0002, "epoch": 4.136413641364136, "step": 5640}, {"loss": 0.6385, "grad_norm": 0.8450375199317932, "learning_rate": 0.0002, "epoch": 4.143747708104144, "step": 5650}, {"loss": 0.6866, "grad_norm": 0.9006940126419067, "learning_rate": 0.0002, "epoch": 4.151081774844151, "step": 5660}, {"loss": 0.6179, "grad_norm": 0.9447154998779297, "learning_rate": 0.0002, "epoch": 4.158415841584159, "step": 5670}, {"loss": 0.6476, "grad_norm": 0.798032283782959, "learning_rate": 0.0002, "epoch": 4.165749908324166, "step": 5680}, {"loss": 0.6666, "grad_norm": 0.65578693151474, "learning_rate": 0.0002, "epoch": 4.1730839750641735, "step": 5690}, {"loss": 0.701, "grad_norm": 1.0864700078964233, "learning_rate": 0.0002, "epoch": 4.18041804180418, "step": 5700}, {"loss": 0.6895, "grad_norm": 0.7344121932983398, "learning_rate": 0.0002, "epoch": 4.187752108544188, "step": 5710}, {"loss": 0.6659, "grad_norm": 0.9722456932067871, "learning_rate": 0.0002, "epoch": 4.195086175284195, "step": 5720}, {"loss": 0.6887, "grad_norm": 1.263814926147461, "learning_rate": 0.0002, "epoch": 4.2024202420242025, "step": 5730}, {"loss": 0.608, "grad_norm": 0.9622581005096436, "learning_rate": 0.0002, "epoch": 4.20975430876421, "step": 5740}, {"loss": 0.6221, "grad_norm": 0.8497143387794495, "learning_rate": 0.0002, "epoch": 4.2170883755042174, "step": 5750}, {"loss": 0.6322, "grad_norm": 0.8248446583747864, "learning_rate": 0.0002, "epoch": 4.224422442244224, "step": 5760}, {"loss": 0.6045, "grad_norm": 1.2544798851013184, "learning_rate": 0.0002, "epoch": 4.2317565089842315, "step": 5770}, {"loss": 0.641, "grad_norm": 0.8224676251411438, "learning_rate": 0.0002, "epoch": 4.239090575724239, "step": 5780}, {"loss": 0.6399, "grad_norm": 0.8924877047538757, "learning_rate": 0.0002, "epoch": 4.2464246424642464, "step": 5790}, {"loss": 0.6845, "grad_norm": 0.8545848727226257, "learning_rate": 0.0002, "epoch": 4.253758709204254, "step": 5800}, {"loss": 0.6669, "grad_norm": 0.8081067800521851, "learning_rate": 0.0002, "epoch": 4.261092775944261, "step": 5810}, {"loss": 0.6149, "grad_norm": 0.7111002802848816, "learning_rate": 0.0002, "epoch": 4.268426842684269, "step": 5820}, {"loss": 0.6343, "grad_norm": 0.8696979880332947, "learning_rate": 0.0002, "epoch": 4.2757609094242754, "step": 5830}, {"loss": 0.6384, "grad_norm": 0.821401834487915, "learning_rate": 0.0002, "epoch": 4.283094976164283, "step": 5840}, {"loss": 0.6912, "grad_norm": 0.888908326625824, "learning_rate": 0.0002, "epoch": 4.29042904290429, "step": 5850}, {"loss": 0.6061, "grad_norm": 1.9380123615264893, "learning_rate": 0.0002, "epoch": 4.297763109644298, "step": 5860}, {"loss": 0.6766, "grad_norm": 1.121774435043335, "learning_rate": 0.0002, "epoch": 4.305097176384305, "step": 5870}, {"loss": 0.7205, "grad_norm": 0.9238282442092896, "learning_rate": 0.0002, "epoch": 4.312431243124313, "step": 5880}, {"loss": 0.6351, "grad_norm": 0.7321620583534241, "learning_rate": 0.0002, "epoch": 4.319765309864319, "step": 5890}, {"loss": 0.6404, "grad_norm": 0.8739548325538635, "learning_rate": 0.0002, "epoch": 4.327099376604327, "step": 5900}, {"loss": 0.5892, "grad_norm": 0.9686012268066406, "learning_rate": 0.0002, "epoch": 4.334433443344334, "step": 5910}, {"loss": 0.641, "grad_norm": 0.9033839106559753, "learning_rate": 0.0002, "epoch": 4.341767510084342, "step": 5920}, {"loss": 0.6456, "grad_norm": 0.8131115436553955, "learning_rate": 0.0002, "epoch": 4.349101576824349, "step": 5930}, {"loss": 0.5826, "grad_norm": 0.8942412734031677, "learning_rate": 0.0002, "epoch": 4.356435643564357, "step": 5940}, {"loss": 0.7336, "grad_norm": 0.8439112901687622, "learning_rate": 0.0002, "epoch": 4.363769710304364, "step": 5950}, {"loss": 0.6537, "grad_norm": 0.9176713228225708, "learning_rate": 0.0002, "epoch": 4.371103777044371, "step": 5960}, {"loss": 0.6792, "grad_norm": 0.6799634695053101, "learning_rate": 0.0002, "epoch": 4.378437843784378, "step": 5970}, {"loss": 0.7266, "grad_norm": 1.0435824394226074, "learning_rate": 0.0002, "epoch": 4.385771910524386, "step": 5980}, {"loss": 0.68, "grad_norm": 0.997937798500061, "learning_rate": 0.0002, "epoch": 4.393105977264393, "step": 5990}, {"loss": 0.6604, "grad_norm": 1.0308842658996582, "learning_rate": 0.0002, "epoch": 4.400440044004401, "step": 6000}, {"loss": 0.6402, "grad_norm": 1.3683775663375854, "learning_rate": 0.0002, "epoch": 4.407774110744408, "step": 6010}, {"loss": 0.7027, "grad_norm": 0.7569534182548523, "learning_rate": 0.0002, "epoch": 4.415108177484415, "step": 6020}, {"loss": 0.5949, "grad_norm": 1.089978575706482, "learning_rate": 0.0002, "epoch": 4.422442244224422, "step": 6030}, {"loss": 0.6353, "grad_norm": 0.7522459626197815, "learning_rate": 0.0002, "epoch": 4.42977631096443, "step": 6040}, {"loss": 0.5852, "grad_norm": 0.6709823608398438, "learning_rate": 0.0002, "epoch": 4.437110377704437, "step": 6050}, {"loss": 0.6718, "grad_norm": 0.6992089748382568, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6060}, {"loss": 0.6933, "grad_norm": 1.0182931423187256, "learning_rate": 0.0002, "epoch": 4.451778511184452, "step": 6070}, {"loss": 0.6255, "grad_norm": 1.0685160160064697, "learning_rate": 0.0002, "epoch": 4.459112577924459, "step": 6080}, {"loss": 0.6086, "grad_norm": 0.8295124769210815, "learning_rate": 0.0002, "epoch": 4.466446644664466, "step": 6090}, {"loss": 0.6359, "grad_norm": 1.1862998008728027, "learning_rate": 0.0002, "epoch": 4.473780711404474, "step": 6100}, {"loss": 0.638, "grad_norm": 0.7400273084640503, "learning_rate": 0.0002, "epoch": 4.481114778144481, "step": 6110}, {"loss": 0.6854, "grad_norm": 0.7098417282104492, "learning_rate": 0.0002, "epoch": 4.488448844884489, "step": 6120}, {"loss": 0.6976, "grad_norm": 0.9745053648948669, "learning_rate": 0.0002, "epoch": 4.495782911624496, "step": 6130}, {"loss": 0.605, "grad_norm": 0.8638797998428345, "learning_rate": 0.0002, "epoch": 4.503116978364503, "step": 6140}, {"loss": 0.6491, "grad_norm": 0.8291046619415283, "learning_rate": 0.0002, "epoch": 4.51045104510451, "step": 6150}, {"loss": 0.6457, "grad_norm": 1.0301737785339355, "learning_rate": 0.0002, "epoch": 4.517785111844518, "step": 6160}, {"loss": 0.6742, "grad_norm": 1.1996512413024902, "learning_rate": 0.0002, "epoch": 4.525119178584525, "step": 6170}, {"loss": 0.6484, "grad_norm": 1.151038408279419, "learning_rate": 0.0002, "epoch": 4.5324532453245325, "step": 6180}, {"loss": 0.668, "grad_norm": 0.8385201096534729, "learning_rate": 0.0002, "epoch": 4.53978731206454, "step": 6190}, {"loss": 0.6381, "grad_norm": 0.8969188332557678, "learning_rate": 0.0002, "epoch": 4.5471213788045475, "step": 6200}, {"loss": 0.7141, "grad_norm": 1.60659658908844, "learning_rate": 0.0002, "epoch": 4.554455445544555, "step": 6210}, {"loss": 0.6388, "grad_norm": 0.9356731176376343, "learning_rate": 0.0002, "epoch": 4.5617895122845615, "step": 6220}, {"loss": 0.7393, "grad_norm": 0.95856773853302, "learning_rate": 0.0002, "epoch": 4.569123579024569, "step": 6230}, {"loss": 0.6554, "grad_norm": 1.1162524223327637, "learning_rate": 0.0002, "epoch": 4.5764576457645765, "step": 6240}, {"loss": 0.6012, "grad_norm": 0.8809238076210022, "learning_rate": 0.0002, "epoch": 4.583791712504584, "step": 6250}, {"loss": 0.648, "grad_norm": 0.890738844871521, "learning_rate": 0.0002, "epoch": 4.591125779244591, "step": 6260}, {"loss": 0.6663, "grad_norm": 0.918684720993042, "learning_rate": 0.0002, "epoch": 4.598459845984598, "step": 6270}, {"loss": 0.5992, "grad_norm": 0.8156296610832214, "learning_rate": 0.0002, "epoch": 4.6057939127246055, "step": 6280}, {"loss": 0.723, "grad_norm": 1.046634316444397, "learning_rate": 0.0002, "epoch": 4.613127979464613, "step": 6290}, {"loss": 0.7023, "grad_norm": 0.7725525498390198, "learning_rate": 0.0002, "epoch": 4.62046204620462, "step": 6300}, {"loss": 0.6414, "grad_norm": 0.9992046356201172, "learning_rate": 0.0002, "epoch": 4.627796112944628, "step": 6310}, {"loss": 0.6201, "grad_norm": 0.8480095267295837, "learning_rate": 0.0002, "epoch": 4.635130179684635, "step": 6320}, {"loss": 0.6869, "grad_norm": 0.7061955332756042, "learning_rate": 0.0002, "epoch": 4.642464246424643, "step": 6330}, {"loss": 0.6828, "grad_norm": 1.0354212522506714, "learning_rate": 0.0002, "epoch": 4.649798313164649, "step": 6340}, {"loss": 0.6651, "grad_norm": 1.0081377029418945, "learning_rate": 0.0002, "epoch": 4.657132379904657, "step": 6350}, {"loss": 0.726, "grad_norm": 1.2904249429702759, "learning_rate": 0.0002, "epoch": 4.664466446644664, "step": 6360}, {"loss": 0.7148, "grad_norm": 0.9248910546302795, "learning_rate": 0.0002, "epoch": 4.671800513384672, "step": 6370}, {"loss": 0.6961, "grad_norm": 0.9907804131507874, "learning_rate": 0.0002, "epoch": 4.679134580124679, "step": 6380}, {"loss": 0.6163, "grad_norm": 1.201143741607666, "learning_rate": 0.0002, "epoch": 4.686468646864687, "step": 6390}, {"loss": 0.6762, "grad_norm": 0.8709394335746765, "learning_rate": 0.0002, "epoch": 4.693802713604693, "step": 6400}, {"loss": 0.7217, "grad_norm": 0.7468608021736145, "learning_rate": 0.0002, "epoch": 4.701136780344701, "step": 6410}, {"loss": 0.6548, "grad_norm": 0.8607903718948364, "learning_rate": 0.0002, "epoch": 4.708470847084708, "step": 6420}, {"loss": 0.6449, "grad_norm": 0.9840512871742249, "learning_rate": 0.0002, "epoch": 4.715804913824716, "step": 6430}, {"loss": 0.685, "grad_norm": 0.8328204154968262, "learning_rate": 0.0002, "epoch": 4.723138980564723, "step": 6440}, {"loss": 0.697, "grad_norm": 0.924505352973938, "learning_rate": 0.0002, "epoch": 4.730473047304731, "step": 6450}, {"loss": 0.7422, "grad_norm": 0.8897685408592224, "learning_rate": 0.0002, "epoch": 4.737807114044738, "step": 6460}, {"loss": 0.6842, "grad_norm": 0.9605024456977844, "learning_rate": 0.0002, "epoch": 4.745141180784745, "step": 6470}, {"loss": 0.6488, "grad_norm": 0.8150759935379028, "learning_rate": 0.0002, "epoch": 4.752475247524752, "step": 6480}, {"loss": 0.6606, "grad_norm": 0.8128412961959839, "learning_rate": 0.0002, "epoch": 4.75980931426476, "step": 6490}, {"loss": 0.6729, "grad_norm": 0.7381404638290405, "learning_rate": 0.0002, "epoch": 4.767143381004767, "step": 6500}, {"loss": 0.6713, "grad_norm": 1.0565853118896484, "learning_rate": 0.0002, "epoch": 4.774477447744775, "step": 6510}, {"loss": 0.6496, "grad_norm": 0.9298134446144104, "learning_rate": 0.0002, "epoch": 4.781811514484782, "step": 6520}, {"loss": 0.7279, "grad_norm": 1.0145525932312012, "learning_rate": 0.0002, "epoch": 4.789145581224789, "step": 6530}, {"loss": 0.5986, "grad_norm": 0.92259681224823, "learning_rate": 0.0002, "epoch": 4.796479647964796, "step": 6540}, {"loss": 0.63, "grad_norm": 0.7881024479866028, "learning_rate": 0.0002, "epoch": 4.803813714704804, "step": 6550}, {"loss": 0.7134, "grad_norm": 1.4935206174850464, "learning_rate": 0.0002, "epoch": 4.811147781444811, "step": 6560}, {"loss": 0.6695, "grad_norm": 0.8612369298934937, "learning_rate": 0.0002, "epoch": 4.818481848184819, "step": 6570}, {"loss": 0.779, "grad_norm": 1.0118653774261475, "learning_rate": 0.0002, "epoch": 4.825815914924826, "step": 6580}, {"loss": 0.6991, "grad_norm": 1.1303809881210327, "learning_rate": 0.0002, "epoch": 4.833149981664834, "step": 6590}, {"loss": 0.7887, "grad_norm": 0.9112492203712463, "learning_rate": 0.0002, "epoch": 4.84048404840484, "step": 6600}, {"loss": 0.7699, "grad_norm": 0.864762544631958, "learning_rate": 0.0002, "epoch": 4.847818115144848, "step": 6610}, {"loss": 0.7347, "grad_norm": 0.9090572595596313, "learning_rate": 0.0002, "epoch": 4.855152181884855, "step": 6620}, {"loss": 0.6608, "grad_norm": 1.014953374862671, "learning_rate": 0.0002, "epoch": 4.862486248624863, "step": 6630}, {"loss": 0.6429, "grad_norm": 1.0702149868011475, "learning_rate": 0.0002, "epoch": 4.86982031536487, "step": 6640}, {"loss": 0.6943, "grad_norm": 1.002135157585144, "learning_rate": 0.0002, "epoch": 4.8771543821048775, "step": 6650}, {"loss": 0.7225, "grad_norm": 0.862545907497406, "learning_rate": 0.0002, "epoch": 4.884488448844884, "step": 6660}, {"loss": 0.6206, "grad_norm": 0.7302131056785583, "learning_rate": 0.0002, "epoch": 4.891822515584892, "step": 6670}, {"loss": 0.7175, "grad_norm": 0.8380730152130127, "learning_rate": 0.0002, "epoch": 4.899156582324899, "step": 6680}, {"loss": 0.645, "grad_norm": 0.7956018447875977, "learning_rate": 0.0002, "epoch": 4.9064906490649065, "step": 6690}, {"loss": 0.6431, "grad_norm": 0.6717583537101746, "learning_rate": 0.0002, "epoch": 4.913824715804914, "step": 6700}, {"loss": 0.6942, "grad_norm": 1.09099280834198, "learning_rate": 0.0002, "epoch": 4.9211587825449215, "step": 6710}, {"loss": 0.7533, "grad_norm": 0.8589889407157898, "learning_rate": 0.0002, "epoch": 4.928492849284929, "step": 6720}, {"loss": 0.66, "grad_norm": 1.0046314001083374, "learning_rate": 0.0002, "epoch": 4.9358269160249355, "step": 6730}, {"loss": 0.6864, "grad_norm": 0.8559659123420715, "learning_rate": 0.0002, "epoch": 4.943160982764943, "step": 6740}, {"loss": 0.6847, "grad_norm": 0.8588525652885437, "learning_rate": 0.0002, "epoch": 4.9504950495049505, "step": 6750}, {"loss": 0.6428, "grad_norm": 0.9192708134651184, "learning_rate": 0.0002, "epoch": 4.957829116244958, "step": 6760}, {"loss": 0.6873, "grad_norm": 1.051398754119873, "learning_rate": 0.0002, "epoch": 4.965163182984965, "step": 6770}, {"loss": 0.7249, "grad_norm": 0.9111362099647522, "learning_rate": 0.0002, "epoch": 4.972497249724973, "step": 6780}, {"loss": 0.7613, "grad_norm": 0.7305638194084167, "learning_rate": 0.0002, "epoch": 4.9798313164649795, "step": 6790}, {"loss": 0.6747, "grad_norm": 1.118837594985962, "learning_rate": 0.0002, "epoch": 4.987165383204987, "step": 6800}, {"loss": 0.6412, "grad_norm": 0.9075239300727844, "learning_rate": 0.0002, "epoch": 4.994499449944994, "step": 6810}, {"eval_loss": 1.2361247539520264, "eval_runtime": 32.7325, "eval_samples_per_second": 13.167, "eval_steps_per_second": 1.65, "epoch": 4.999633296662999, "step": 6817}, {"loss": 0.7091, "grad_norm": 1.0541315078735352, "learning_rate": 0.0002, "epoch": 5.001833516685002, "step": 6820}, {"loss": 0.4882, "grad_norm": 0.9750140905380249, "learning_rate": 0.0002, "epoch": 5.009167583425009, "step": 6830}, {"loss": 0.6022, "grad_norm": 0.931838870048523, "learning_rate": 0.0002, "epoch": 5.016501650165017, "step": 6840}, {"loss": 0.5194, "grad_norm": 1.110278844833374, "learning_rate": 0.0002, "epoch": 5.023835716905023, "step": 6850}, {"loss": 0.4676, "grad_norm": 1.0670180320739746, "learning_rate": 0.0002, "epoch": 5.031169783645031, "step": 6860}, {"loss": 0.4374, "grad_norm": 0.8762092590332031, "learning_rate": 0.0002, "epoch": 5.038503850385038, "step": 6870}, {"loss": 0.505, "grad_norm": 1.1169432401657104, "learning_rate": 0.0002, "epoch": 5.045837917125046, "step": 6880}, {"loss": 0.5114, "grad_norm": 1.005491018295288, "learning_rate": 0.0002, "epoch": 5.053171983865053, "step": 6890}, {"loss": 0.5221, "grad_norm": 1.1751841306686401, "learning_rate": 0.0002, "epoch": 5.060506050605061, "step": 6900}, {"loss": 0.451, "grad_norm": 0.8501367568969727, "learning_rate": 0.0002, "epoch": 5.067840117345068, "step": 6910}, {"loss": 0.5292, "grad_norm": 0.9795131683349609, "learning_rate": 0.0002, "epoch": 5.075174184085075, "step": 6920}, {"loss": 0.5234, "grad_norm": 0.8929879665374756, "learning_rate": 0.0002, "epoch": 5.082508250825082, "step": 6930}, {"loss": 0.5378, "grad_norm": 1.0156651735305786, "learning_rate": 0.0002, "epoch": 5.08984231756509, "step": 6940}, {"loss": 0.5241, "grad_norm": 1.0974335670471191, "learning_rate": 0.0002, "epoch": 5.097176384305097, "step": 6950}, {"loss": 0.5705, "grad_norm": 1.7015666961669922, "learning_rate": 0.0002, "epoch": 5.104510451045105, "step": 6960}, {"loss": 0.523, "grad_norm": 1.0343226194381714, "learning_rate": 0.0002, "epoch": 5.111844517785112, "step": 6970}, {"loss": 0.4616, "grad_norm": 1.3072983026504517, "learning_rate": 0.0002, "epoch": 5.119178584525119, "step": 6980}, {"loss": 0.4813, "grad_norm": 1.038986086845398, "learning_rate": 0.0002, "epoch": 5.126512651265126, "step": 6990}, {"loss": 0.4616, "grad_norm": 0.8638386130332947, "learning_rate": 0.0002, "epoch": 5.133846718005134, "step": 7000}, {"loss": 0.5294, "grad_norm": 0.8326523900032043, "learning_rate": 0.0002, "epoch": 5.141180784745141, "step": 7010}, {"loss": 0.5021, "grad_norm": 1.0976895093917847, "learning_rate": 0.0002, "epoch": 5.148514851485149, "step": 7020}, {"loss": 0.4677, "grad_norm": 1.0077873468399048, "learning_rate": 0.0002, "epoch": 5.155848918225156, "step": 7030}, {"loss": 0.5262, "grad_norm": 1.0662257671356201, "learning_rate": 0.0002, "epoch": 5.163182984965164, "step": 7040}, {"loss": 0.5484, "grad_norm": 1.206271767616272, "learning_rate": 0.0002, "epoch": 5.17051705170517, "step": 7050}, {"loss": 0.4817, "grad_norm": 1.1990262269973755, "learning_rate": 0.0002, "epoch": 5.177851118445178, "step": 7060}, {"loss": 0.6048, "grad_norm": 1.0207163095474243, "learning_rate": 0.0002, "epoch": 5.185185185185185, "step": 7070}, {"loss": 0.4816, "grad_norm": 1.2783987522125244, "learning_rate": 0.0002, "epoch": 5.192519251925193, "step": 7080}, {"loss": 0.5322, "grad_norm": 1.1592512130737305, "learning_rate": 0.0002, "epoch": 5.1998533186652, "step": 7090}, {"loss": 0.5472, "grad_norm": 1.1053160429000854, "learning_rate": 0.0002, "epoch": 5.2071873854052075, "step": 7100}, {"loss": 0.4986, "grad_norm": 1.1925510168075562, "learning_rate": 0.0002, "epoch": 5.214521452145214, "step": 7110}, {"loss": 0.5065, "grad_norm": 1.0714877843856812, "learning_rate": 0.0002, "epoch": 5.221855518885222, "step": 7120}, {"loss": 0.5209, "grad_norm": 0.9451011419296265, "learning_rate": 0.0002, "epoch": 5.229189585625229, "step": 7130}, {"loss": 0.5298, "grad_norm": 1.03838050365448, "learning_rate": 0.0002, "epoch": 5.2365236523652365, "step": 7140}, {"loss": 0.4848, "grad_norm": 0.9204146265983582, "learning_rate": 0.0002, "epoch": 5.243857719105244, "step": 7150}, {"loss": 0.5164, "grad_norm": 1.0142229795455933, "learning_rate": 0.0002, "epoch": 5.2511917858452515, "step": 7160}, {"loss": 0.5092, "grad_norm": 1.4432005882263184, "learning_rate": 0.0002, "epoch": 5.258525852585258, "step": 7170}, {"loss": 0.5133, "grad_norm": 1.1239633560180664, "learning_rate": 0.0002, "epoch": 5.2658599193252655, "step": 7180}, {"loss": 0.4969, "grad_norm": 0.7012821435928345, "learning_rate": 0.0002, "epoch": 5.273193986065273, "step": 7190}, {"loss": 0.5466, "grad_norm": 1.3499128818511963, "learning_rate": 0.0002, "epoch": 5.2805280528052805, "step": 7200}, {"loss": 0.5282, "grad_norm": 0.9498730897903442, "learning_rate": 0.0002, "epoch": 5.287862119545288, "step": 7210}, {"loss": 0.5051, "grad_norm": 0.9552369117736816, "learning_rate": 0.0002, "epoch": 5.295196186285295, "step": 7220}, {"loss": 0.5329, "grad_norm": 0.7610348463058472, "learning_rate": 0.0002, "epoch": 5.302530253025303, "step": 7230}, {"loss": 0.468, "grad_norm": 1.0314512252807617, "learning_rate": 0.0002, "epoch": 5.3098643197653095, "step": 7240}, {"loss": 0.5367, "grad_norm": 1.0534334182739258, "learning_rate": 0.0002, "epoch": 5.317198386505317, "step": 7250}, {"loss": 0.5491, "grad_norm": 1.2553406953811646, "learning_rate": 0.0002, "epoch": 5.324532453245324, "step": 7260}, {"loss": 0.5218, "grad_norm": 0.7061691880226135, "learning_rate": 0.0002, "epoch": 5.331866519985332, "step": 7270}, {"loss": 0.5625, "grad_norm": 0.9652578830718994, "learning_rate": 0.0002, "epoch": 5.339200586725339, "step": 7280}, {"loss": 0.5608, "grad_norm": 1.114788293838501, "learning_rate": 0.0002, "epoch": 5.346534653465347, "step": 7290}, {"loss": 0.578, "grad_norm": 1.0940049886703491, "learning_rate": 0.0002, "epoch": 5.353868720205353, "step": 7300}, {"loss": 0.5256, "grad_norm": 1.0151008367538452, "learning_rate": 0.0002, "epoch": 5.361202786945361, "step": 7310}, {"loss": 0.5377, "grad_norm": 1.0369552373886108, "learning_rate": 0.0002, "epoch": 5.368536853685368, "step": 7320}, {"loss": 0.5028, "grad_norm": 0.8489866256713867, "learning_rate": 0.0002, "epoch": 5.375870920425376, "step": 7330}, {"loss": 0.5937, "grad_norm": 1.1031713485717773, "learning_rate": 0.0002, "epoch": 5.383204987165383, "step": 7340}, {"loss": 0.5355, "grad_norm": 0.9094716310501099, "learning_rate": 0.0002, "epoch": 5.390539053905391, "step": 7350}, {"loss": 0.5406, "grad_norm": 0.9530431032180786, "learning_rate": 0.0002, "epoch": 5.397873120645398, "step": 7360}, {"loss": 0.529, "grad_norm": 0.9633604884147644, "learning_rate": 0.0002, "epoch": 5.405207187385405, "step": 7370}, {"loss": 0.5315, "grad_norm": 0.9541662335395813, "learning_rate": 0.0002, "epoch": 5.412541254125412, "step": 7380}, {"loss": 0.6774, "grad_norm": 1.0459771156311035, "learning_rate": 0.0002, "epoch": 5.41987532086542, "step": 7390}, {"loss": 0.5737, "grad_norm": 1.027388334274292, "learning_rate": 0.0002, "epoch": 5.427209387605427, "step": 7400}, {"loss": 0.556, "grad_norm": 0.7267653346061707, "learning_rate": 0.0002, "epoch": 5.434543454345435, "step": 7410}, {"loss": 0.4581, "grad_norm": 1.020142674446106, "learning_rate": 0.0002, "epoch": 5.441877521085442, "step": 7420}, {"loss": 0.4853, "grad_norm": 1.044754147529602, "learning_rate": 0.0002, "epoch": 5.449211587825449, "step": 7430}, {"loss": 0.5666, "grad_norm": 1.5476195812225342, "learning_rate": 0.0002, "epoch": 5.456545654565456, "step": 7440}, {"loss": 0.5302, "grad_norm": 0.9879506826400757, "learning_rate": 0.0002, "epoch": 5.463879721305464, "step": 7450}, {"loss": 0.591, "grad_norm": 1.2562980651855469, "learning_rate": 0.0002, "epoch": 5.471213788045471, "step": 7460}, {"loss": 0.5188, "grad_norm": 1.3051384687423706, "learning_rate": 0.0002, "epoch": 5.478547854785479, "step": 7470}, {"loss": 0.5658, "grad_norm": 1.0511597394943237, "learning_rate": 0.0002, "epoch": 5.485881921525486, "step": 7480}, {"loss": 0.6327, "grad_norm": 1.0380817651748657, "learning_rate": 0.0002, "epoch": 5.493215988265494, "step": 7490}, {"loss": 0.5356, "grad_norm": 1.170274257659912, "learning_rate": 0.0002, "epoch": 5.5005500550055, "step": 7500}, {"loss": 0.5405, "grad_norm": 1.3356517553329468, "learning_rate": 0.0002, "epoch": 5.507884121745508, "step": 7510}, {"loss": 0.5305, "grad_norm": 1.0727124214172363, "learning_rate": 0.0002, "epoch": 5.515218188485515, "step": 7520}, {"loss": 0.5543, "grad_norm": 1.0110199451446533, "learning_rate": 0.0002, "epoch": 5.522552255225523, "step": 7530}, {"loss": 0.5962, "grad_norm": 1.3086743354797363, "learning_rate": 0.0002, "epoch": 5.52988632196553, "step": 7540}, {"loss": 0.5512, "grad_norm": 1.1904916763305664, "learning_rate": 0.0002, "epoch": 5.537220388705538, "step": 7550}, {"loss": 0.5915, "grad_norm": 0.9466280937194824, "learning_rate": 0.0002, "epoch": 5.544554455445544, "step": 7560}, {"loss": 0.5573, "grad_norm": 1.1237901449203491, "learning_rate": 0.0002, "epoch": 5.551888522185552, "step": 7570}, {"loss": 0.5383, "grad_norm": 0.9590660333633423, "learning_rate": 0.0002, "epoch": 5.559222588925559, "step": 7580}, {"loss": 0.5594, "grad_norm": 1.0890778303146362, "learning_rate": 0.0002, "epoch": 5.566556655665567, "step": 7590}, {"loss": 0.5698, "grad_norm": 0.7206931114196777, "learning_rate": 0.0002, "epoch": 5.573890722405574, "step": 7600}, {"loss": 0.5511, "grad_norm": 1.2884514331817627, "learning_rate": 0.0002, "epoch": 5.5812247891455815, "step": 7610}, {"loss": 0.5279, "grad_norm": 0.7798039317131042, "learning_rate": 0.0002, "epoch": 5.588558855885589, "step": 7620}, {"loss": 0.4847, "grad_norm": 1.166046142578125, "learning_rate": 0.0002, "epoch": 5.595892922625596, "step": 7630}, {"loss": 0.5821, "grad_norm": 1.0150201320648193, "learning_rate": 0.0002, "epoch": 5.603226989365603, "step": 7640}, {"loss": 0.5296, "grad_norm": 1.0449682474136353, "learning_rate": 0.0002, "epoch": 5.6105610561056105, "step": 7650}, {"loss": 0.5431, "grad_norm": 0.9310530424118042, "learning_rate": 0.0002, "epoch": 5.617895122845618, "step": 7660}, {"loss": 0.5234, "grad_norm": 0.9117933511734009, "learning_rate": 0.0002, "epoch": 5.6252291895856255, "step": 7670}, {"loss": 0.5807, "grad_norm": 1.1475164890289307, "learning_rate": 0.0002, "epoch": 5.632563256325633, "step": 7680}, {"loss": 0.5816, "grad_norm": 1.066809058189392, "learning_rate": 0.0002, "epoch": 5.6398973230656395, "step": 7690}, {"loss": 0.551, "grad_norm": 1.2834991216659546, "learning_rate": 0.0002, "epoch": 5.647231389805647, "step": 7700}, {"loss": 0.5914, "grad_norm": 1.2245112657546997, "learning_rate": 0.0002, "epoch": 5.6545654565456545, "step": 7710}, {"loss": 0.5552, "grad_norm": 1.1424106359481812, "learning_rate": 0.0002, "epoch": 5.661899523285662, "step": 7720}, {"loss": 0.559, "grad_norm": 1.0673892498016357, "learning_rate": 0.0002, "epoch": 5.669233590025669, "step": 7730}, {"loss": 0.544, "grad_norm": 1.4312121868133545, "learning_rate": 0.0002, "epoch": 5.676567656765677, "step": 7740}, {"loss": 0.5576, "grad_norm": 0.9976982474327087, "learning_rate": 0.0002, "epoch": 5.683901723505684, "step": 7750}, {"loss": 0.4855, "grad_norm": 0.9464678168296814, "learning_rate": 0.0002, "epoch": 5.691235790245691, "step": 7760}, {"loss": 0.5363, "grad_norm": 1.010995626449585, "learning_rate": 0.0002, "epoch": 5.698569856985698, "step": 7770}, {"loss": 0.5873, "grad_norm": 1.3787750005722046, "learning_rate": 0.0002, "epoch": 5.705903923725706, "step": 7780}, {"loss": 0.6234, "grad_norm": 1.020922303199768, "learning_rate": 0.0002, "epoch": 5.713237990465713, "step": 7790}, {"loss": 0.5337, "grad_norm": 0.9748636484146118, "learning_rate": 0.0002, "epoch": 5.720572057205721, "step": 7800}, {"loss": 0.5507, "grad_norm": 1.3077744245529175, "learning_rate": 0.0002, "epoch": 5.727906123945728, "step": 7810}, {"loss": 0.558, "grad_norm": 1.4770057201385498, "learning_rate": 0.0002, "epoch": 5.735240190685735, "step": 7820}, {"loss": 0.5571, "grad_norm": 1.6349090337753296, "learning_rate": 0.0002, "epoch": 5.742574257425742, "step": 7830}, {"loss": 0.5056, "grad_norm": 0.9818630814552307, "learning_rate": 0.0002, "epoch": 5.74990832416575, "step": 7840}, {"loss": 0.5495, "grad_norm": 0.9659715890884399, "learning_rate": 0.0002, "epoch": 5.757242390905757, "step": 7850}, {"loss": 0.5628, "grad_norm": 0.9269950985908508, "learning_rate": 0.0002, "epoch": 5.764576457645765, "step": 7860}, {"loss": 0.5594, "grad_norm": 1.0099073648452759, "learning_rate": 0.0002, "epoch": 5.771910524385772, "step": 7870}, {"loss": 0.5912, "grad_norm": 0.9123615026473999, "learning_rate": 0.0002, "epoch": 5.77924459112578, "step": 7880}, {"loss": 0.6054, "grad_norm": 1.1542246341705322, "learning_rate": 0.0002, "epoch": 5.786578657865786, "step": 7890}, {"loss": 0.5829, "grad_norm": 1.0792022943496704, "learning_rate": 0.0002, "epoch": 5.793912724605794, "step": 7900}, {"loss": 0.504, "grad_norm": 0.95615553855896, "learning_rate": 0.0002, "epoch": 5.801246791345801, "step": 7910}, {"loss": 0.5918, "grad_norm": 1.2471332550048828, "learning_rate": 0.0002, "epoch": 5.808580858085809, "step": 7920}, {"loss": 0.5719, "grad_norm": 1.0189851522445679, "learning_rate": 0.0002, "epoch": 5.815914924825816, "step": 7930}, {"loss": 0.5958, "grad_norm": 1.3309742212295532, "learning_rate": 0.0002, "epoch": 5.823248991565823, "step": 7940}, {"loss": 0.6255, "grad_norm": 1.2930549383163452, "learning_rate": 0.0002, "epoch": 5.83058305830583, "step": 7950}, {"loss": 0.5301, "grad_norm": 0.8216308951377869, "learning_rate": 0.0002, "epoch": 5.837917125045838, "step": 7960}, {"loss": 0.5397, "grad_norm": 1.1205775737762451, "learning_rate": 0.0002, "epoch": 5.845251191785845, "step": 7970}, {"loss": 0.5903, "grad_norm": 0.851298451423645, "learning_rate": 0.0002, "epoch": 5.852585258525853, "step": 7980}, {"loss": 0.5981, "grad_norm": 0.8797095417976379, "learning_rate": 0.0002, "epoch": 5.85991932526586, "step": 7990}, {"loss": 0.6106, "grad_norm": 1.5784614086151123, "learning_rate": 0.0002, "epoch": 5.867253392005868, "step": 8000}, {"loss": 0.5956, "grad_norm": 1.1531187295913696, "learning_rate": 0.0002, "epoch": 5.874587458745875, "step": 8010}, {"loss": 0.6289, "grad_norm": 1.2469146251678467, "learning_rate": 0.0002, "epoch": 5.881921525485882, "step": 8020}, {"loss": 0.5827, "grad_norm": 1.0784350633621216, "learning_rate": 0.0002, "epoch": 5.889255592225889, "step": 8030}, {"loss": 0.6339, "grad_norm": 1.1311599016189575, "learning_rate": 0.0002, "epoch": 5.896589658965897, "step": 8040}, {"loss": 0.5815, "grad_norm": 0.9654512405395508, "learning_rate": 0.0002, "epoch": 5.903923725705904, "step": 8050}, {"loss": 0.6198, "grad_norm": 1.3288270235061646, "learning_rate": 0.0002, "epoch": 5.9112577924459115, "step": 8060}, {"loss": 0.6515, "grad_norm": 1.12800931930542, "learning_rate": 0.0002, "epoch": 5.918591859185918, "step": 8070}, {"loss": 0.5684, "grad_norm": 0.9449917674064636, "learning_rate": 0.0002, "epoch": 5.925925925925926, "step": 8080}, {"loss": 0.6063, "grad_norm": 1.1532357931137085, "learning_rate": 0.0002, "epoch": 5.933259992665933, "step": 8090}, {"loss": 0.5318, "grad_norm": 1.2211151123046875, "learning_rate": 0.0002, "epoch": 5.9405940594059405, "step": 8100}, {"loss": 0.6512, "grad_norm": 1.3459105491638184, "learning_rate": 0.0002, "epoch": 5.947928126145948, "step": 8110}, {"loss": 0.5952, "grad_norm": 1.251999855041504, "learning_rate": 0.0002, "epoch": 5.9552621928859555, "step": 8120}, {"loss": 0.6203, "grad_norm": 1.5682506561279297, "learning_rate": 0.0002, "epoch": 5.962596259625963, "step": 8130}, {"loss": 0.6253, "grad_norm": 0.926075279712677, "learning_rate": 0.0002, "epoch": 5.9699303263659695, "step": 8140}, {"loss": 0.5545, "grad_norm": 0.9622511863708496, "learning_rate": 0.0002, "epoch": 5.977264393105977, "step": 8150}, {"loss": 0.5518, "grad_norm": 0.9633373618125916, "learning_rate": 0.0002, "epoch": 5.9845984598459845, "step": 8160}, {"loss": 0.5831, "grad_norm": 0.8960476517677307, "learning_rate": 0.0002, "epoch": 5.991932526585992, "step": 8170}, {"loss": 0.5442, "grad_norm": 0.9372805953025818, "learning_rate": 0.0002, "epoch": 5.999266593325999, "step": 8180}, {"eval_loss": 1.3233846426010132, "eval_runtime": 32.7419, "eval_samples_per_second": 13.164, "eval_steps_per_second": 1.649, "epoch": 6.0, "step": 8181}, {"loss": 0.4644, "grad_norm": 1.1900787353515625, "learning_rate": 0.0002, "epoch": 6.006600660066007, "step": 8190}, {"loss": 0.4509, "grad_norm": 1.1448326110839844, "learning_rate": 0.0002, "epoch": 6.013934726806014, "step": 8200}, {"loss": 0.3667, "grad_norm": 1.1848368644714355, "learning_rate": 0.0002, "epoch": 6.021268793546021, "step": 8210}, {"loss": 0.4315, "grad_norm": 1.2315572500228882, "learning_rate": 0.0002, "epoch": 6.028602860286028, "step": 8220}, {"loss": 0.3541, "grad_norm": 1.2214244604110718, "learning_rate": 0.0002, "epoch": 6.035936927026036, "step": 8230}, {"loss": 0.4025, "grad_norm": 0.9455513954162598, "learning_rate": 0.0002, "epoch": 6.043270993766043, "step": 8240}, {"loss": 0.4448, "grad_norm": 0.9574248790740967, "learning_rate": 0.0002, "epoch": 6.050605060506051, "step": 8250}, {"loss": 0.4271, "grad_norm": 1.1022400856018066, "learning_rate": 0.0002, "epoch": 6.057939127246058, "step": 8260}, {"loss": 0.3603, "grad_norm": 0.9555122256278992, "learning_rate": 0.0002, "epoch": 6.065273193986065, "step": 8270}, {"loss": 0.4324, "grad_norm": 1.1956106424331665, "learning_rate": 0.0002, "epoch": 6.072607260726072, "step": 8280}, {"loss": 0.3924, "grad_norm": 1.3110876083374023, "learning_rate": 0.0002, "epoch": 6.07994132746608, "step": 8290}, {"loss": 0.3664, "grad_norm": 1.1293374300003052, "learning_rate": 0.0002, "epoch": 6.087275394206087, "step": 8300}, {"loss": 0.385, "grad_norm": 0.9176164269447327, "learning_rate": 0.0002, "epoch": 6.094609460946095, "step": 8310}, {"loss": 0.4142, "grad_norm": 0.9751231670379639, "learning_rate": 0.0002, "epoch": 6.101943527686102, "step": 8320}, {"loss": 0.4356, "grad_norm": 1.0536044836044312, "learning_rate": 0.0002, "epoch": 6.109277594426109, "step": 8330}, {"loss": 0.409, "grad_norm": 1.289342999458313, "learning_rate": 0.0002, "epoch": 6.116611661166116, "step": 8340}, {"loss": 0.4121, "grad_norm": 1.1773661375045776, "learning_rate": 0.0002, "epoch": 6.123945727906124, "step": 8350}, {"loss": 0.4499, "grad_norm": 1.2450661659240723, "learning_rate": 0.0002, "epoch": 6.131279794646131, "step": 8360}, {"loss": 0.4467, "grad_norm": 1.3965914249420166, "learning_rate": 0.0002, "epoch": 6.138613861386139, "step": 8370}, {"loss": 0.4024, "grad_norm": 1.3530808687210083, "learning_rate": 0.0002, "epoch": 6.145947928126146, "step": 8380}, {"loss": 0.4658, "grad_norm": 1.296276330947876, "learning_rate": 0.0002, "epoch": 6.153281994866154, "step": 8390}, {"loss": 0.5073, "grad_norm": 0.9759053587913513, "learning_rate": 0.0002, "epoch": 6.16061606160616, "step": 8400}, {"loss": 0.4718, "grad_norm": 1.2110707759857178, "learning_rate": 0.0002, "epoch": 6.167950128346168, "step": 8410}, {"loss": 0.4453, "grad_norm": 1.312226414680481, "learning_rate": 0.0002, "epoch": 6.175284195086175, "step": 8420}, {"loss": 0.4183, "grad_norm": 1.1696736812591553, "learning_rate": 0.0002, "epoch": 6.182618261826183, "step": 8430}, {"loss": 0.4546, "grad_norm": 1.260304570198059, "learning_rate": 0.0002, "epoch": 6.18995232856619, "step": 8440}, {"loss": 0.4137, "grad_norm": 1.472961187362671, "learning_rate": 0.0002, "epoch": 6.197286395306198, "step": 8450}, {"loss": 0.42, "grad_norm": 1.3618475198745728, "learning_rate": 0.0002, "epoch": 6.204620462046204, "step": 8460}, {"loss": 0.415, "grad_norm": 1.2544318437576294, "learning_rate": 0.0002, "epoch": 6.211954528786212, "step": 8470}, {"loss": 0.3907, "grad_norm": 1.205898642539978, "learning_rate": 0.0002, "epoch": 6.219288595526219, "step": 8480}, {"loss": 0.4431, "grad_norm": 0.9984724521636963, "learning_rate": 0.0002, "epoch": 6.226622662266227, "step": 8490}, {"loss": 0.4768, "grad_norm": 1.3184109926223755, "learning_rate": 0.0002, "epoch": 6.233956729006234, "step": 8500}, {"loss": 0.3859, "grad_norm": 1.135520100593567, "learning_rate": 0.0002, "epoch": 6.241290795746242, "step": 8510}, {"loss": 0.4159, "grad_norm": 1.4528400897979736, "learning_rate": 0.0002, "epoch": 6.248624862486249, "step": 8520}, {"loss": 0.4347, "grad_norm": 1.1222716569900513, "learning_rate": 0.0002, "epoch": 6.255958929226256, "step": 8530}, {"loss": 0.4581, "grad_norm": 1.7878046035766602, "learning_rate": 0.0002, "epoch": 6.263292995966263, "step": 8540}, {"loss": 0.4298, "grad_norm": 0.9789481163024902, "learning_rate": 0.0002, "epoch": 6.270627062706271, "step": 8550}, {"loss": 0.4316, "grad_norm": 1.151977300643921, "learning_rate": 0.0002, "epoch": 6.277961129446278, "step": 8560}, {"loss": 0.428, "grad_norm": 1.389968752861023, "learning_rate": 0.0002, "epoch": 6.2852951961862855, "step": 8570}, {"loss": 0.3903, "grad_norm": 0.884211003780365, "learning_rate": 0.0002, "epoch": 6.292629262926293, "step": 8580}, {"loss": 0.4611, "grad_norm": 1.3604296445846558, "learning_rate": 0.0002, "epoch": 6.2999633296663, "step": 8590}, {"loss": 0.4183, "grad_norm": 1.1845694780349731, "learning_rate": 0.0002, "epoch": 6.307297396406307, "step": 8600}, {"loss": 0.472, "grad_norm": 1.3231550455093384, "learning_rate": 0.0002, "epoch": 6.3146314631463145, "step": 8610}, {"loss": 0.3922, "grad_norm": 0.9546721577644348, "learning_rate": 0.0002, "epoch": 6.321965529886322, "step": 8620}, {"loss": 0.4395, "grad_norm": 1.2329787015914917, "learning_rate": 0.0002, "epoch": 6.3292995966263295, "step": 8630}, {"loss": 0.4344, "grad_norm": 1.0240199565887451, "learning_rate": 0.0002, "epoch": 6.336633663366337, "step": 8640}, {"loss": 0.4529, "grad_norm": 1.1866962909698486, "learning_rate": 0.0002, "epoch": 6.343967730106344, "step": 8650}, {"loss": 0.4575, "grad_norm": 1.2819687128067017, "learning_rate": 0.0002, "epoch": 6.351301796846351, "step": 8660}, {"loss": 0.455, "grad_norm": 0.9654944539070129, "learning_rate": 0.0002, "epoch": 6.3586358635863585, "step": 8670}, {"loss": 0.4739, "grad_norm": 0.9443874955177307, "learning_rate": 0.0002, "epoch": 6.365969930326366, "step": 8680}, {"loss": 0.435, "grad_norm": 1.2914115190505981, "learning_rate": 0.0002, "epoch": 6.373303997066373, "step": 8690}, {"loss": 0.4392, "grad_norm": 1.4558709859848022, "learning_rate": 0.0002, "epoch": 6.380638063806381, "step": 8700}, {"loss": 0.4398, "grad_norm": 1.3255952596664429, "learning_rate": 0.0002, "epoch": 6.387972130546388, "step": 8710}, {"loss": 0.4451, "grad_norm": 1.348742961883545, "learning_rate": 0.0002, "epoch": 6.395306197286395, "step": 8720}, {"loss": 0.41, "grad_norm": 1.0096025466918945, "learning_rate": 0.0002, "epoch": 6.402640264026402, "step": 8730}, {"loss": 0.4459, "grad_norm": 1.1720590591430664, "learning_rate": 0.0002, "epoch": 6.40997433076641, "step": 8740}, {"loss": 0.5059, "grad_norm": 1.1803077459335327, "learning_rate": 0.0002, "epoch": 6.417308397506417, "step": 8750}, {"loss": 0.4539, "grad_norm": 1.3649998903274536, "learning_rate": 0.0002, "epoch": 6.424642464246425, "step": 8760}, {"loss": 0.4171, "grad_norm": 1.1503992080688477, "learning_rate": 0.0002, "epoch": 6.431976530986432, "step": 8770}, {"loss": 0.488, "grad_norm": 1.1537176370620728, "learning_rate": 0.0002, "epoch": 6.43931059772644, "step": 8780}, {"loss": 0.4167, "grad_norm": 0.9743003845214844, "learning_rate": 0.0002, "epoch": 6.446644664466446, "step": 8790}, {"loss": 0.4813, "grad_norm": 0.9097744822502136, "learning_rate": 0.0002, "epoch": 6.453978731206454, "step": 8800}, {"loss": 0.4809, "grad_norm": 2.0174002647399902, "learning_rate": 0.0002, "epoch": 6.461312797946461, "step": 8810}, {"loss": 0.4879, "grad_norm": 1.0809309482574463, "learning_rate": 0.0002, "epoch": 6.468646864686469, "step": 8820}, {"loss": 0.4235, "grad_norm": 1.100294828414917, "learning_rate": 0.0002, "epoch": 6.475980931426476, "step": 8830}, {"loss": 0.4251, "grad_norm": 1.3707489967346191, "learning_rate": 0.0002, "epoch": 6.483314998166484, "step": 8840}, {"loss": 0.4533, "grad_norm": 1.1304761171340942, "learning_rate": 0.0002, "epoch": 6.49064906490649, "step": 8850}, {"loss": 0.4596, "grad_norm": 1.2171573638916016, "learning_rate": 0.0002, "epoch": 6.497983131646498, "step": 8860}, {"loss": 0.4694, "grad_norm": 1.0452901124954224, "learning_rate": 0.0002, "epoch": 6.505317198386505, "step": 8870}, {"loss": 0.4855, "grad_norm": 1.197298526763916, "learning_rate": 0.0002, "epoch": 6.512651265126513, "step": 8880}, {"loss": 0.4167, "grad_norm": 0.9179880619049072, "learning_rate": 0.0002, "epoch": 6.51998533186652, "step": 8890}, {"loss": 0.445, "grad_norm": 1.415079951286316, "learning_rate": 0.0002, "epoch": 6.527319398606528, "step": 8900}, {"loss": 0.424, "grad_norm": 1.1032487154006958, "learning_rate": 0.0002, "epoch": 6.534653465346535, "step": 8910}, {"loss": 0.4496, "grad_norm": 1.2295007705688477, "learning_rate": 0.0002, "epoch": 6.541987532086542, "step": 8920}, {"loss": 0.4755, "grad_norm": 1.4223219156265259, "learning_rate": 0.0002, "epoch": 6.549321598826549, "step": 8930}, {"loss": 0.4597, "grad_norm": 1.2785786390304565, "learning_rate": 0.0002, "epoch": 6.556655665566557, "step": 8940}, {"loss": 0.4651, "grad_norm": 1.3514775037765503, "learning_rate": 0.0002, "epoch": 6.563989732306564, "step": 8950}, {"loss": 0.4961, "grad_norm": 1.107937216758728, "learning_rate": 0.0002, "epoch": 6.571323799046572, "step": 8960}, {"loss": 0.4954, "grad_norm": 1.2839902639389038, "learning_rate": 0.0002, "epoch": 6.578657865786578, "step": 8970}, {"loss": 0.4207, "grad_norm": 0.9793244004249573, "learning_rate": 0.0002, "epoch": 6.585991932526586, "step": 8980}, {"loss": 0.4989, "grad_norm": 1.3403126001358032, "learning_rate": 0.0002, "epoch": 6.593325999266593, "step": 8990}, {"loss": 0.465, "grad_norm": 1.2612813711166382, "learning_rate": 0.0002, "epoch": 6.600660066006601, "step": 9000}, {"loss": 0.4589, "grad_norm": 1.4347625970840454, "learning_rate": 0.0002, "epoch": 6.607994132746608, "step": 9010}, {"loss": 0.4864, "grad_norm": 1.225921869277954, "learning_rate": 0.0002, "epoch": 6.6153281994866155, "step": 9020}, {"loss": 0.4364, "grad_norm": 1.033644676208496, "learning_rate": 0.0002, "epoch": 6.622662266226623, "step": 9030}, {"loss": 0.4698, "grad_norm": 1.1791894435882568, "learning_rate": 0.0002, "epoch": 6.6299963329666305, "step": 9040}, {"loss": 0.4908, "grad_norm": 1.0968137979507446, "learning_rate": 0.0002, "epoch": 6.637330399706637, "step": 9050}, {"loss": 0.4346, "grad_norm": 1.5639140605926514, "learning_rate": 0.0002, "epoch": 6.6446644664466445, "step": 9060}, {"loss": 0.4627, "grad_norm": 1.4158905744552612, "learning_rate": 0.0002, "epoch": 6.651998533186652, "step": 9070}, {"loss": 0.4619, "grad_norm": 1.2120254039764404, "learning_rate": 0.0002, "epoch": 6.6593325999266595, "step": 9080}, {"loss": 0.4564, "grad_norm": 1.1866531372070312, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 9090}, {"loss": 0.5175, "grad_norm": 1.2704026699066162, "learning_rate": 0.0002, "epoch": 6.6740007334066735, "step": 9100}, {"loss": 0.4859, "grad_norm": 1.1878353357315063, "learning_rate": 0.0002, "epoch": 6.681334800146681, "step": 9110}, {"loss": 0.4657, "grad_norm": 1.193995475769043, "learning_rate": 0.0002, "epoch": 6.6886688668866885, "step": 9120}, {"loss": 0.4939, "grad_norm": 1.2927545309066772, "learning_rate": 0.0002, "epoch": 6.696002933626696, "step": 9130}, {"loss": 0.4157, "grad_norm": 1.0770703554153442, "learning_rate": 0.0002, "epoch": 6.703337000366703, "step": 9140}, {"loss": 0.4571, "grad_norm": 1.2200851440429688, "learning_rate": 0.0002, "epoch": 6.710671067106711, "step": 9150}, {"loss": 0.4605, "grad_norm": 1.293891191482544, "learning_rate": 0.0002, "epoch": 6.718005133846718, "step": 9160}, {"loss": 0.5328, "grad_norm": 1.9376052618026733, "learning_rate": 0.0002, "epoch": 6.725339200586725, "step": 9170}, {"loss": 0.4861, "grad_norm": 1.0353254079818726, "learning_rate": 0.0002, "epoch": 6.732673267326732, "step": 9180}, {"loss": 0.5668, "grad_norm": 1.1274057626724243, "learning_rate": 0.0002, "epoch": 6.74000733406674, "step": 9190}, {"loss": 0.4486, "grad_norm": 1.3344064950942993, "learning_rate": 0.0002, "epoch": 6.747341400806747, "step": 9200}, {"loss": 0.49, "grad_norm": 1.303621768951416, "learning_rate": 0.0002, "epoch": 6.754675467546755, "step": 9210}, {"loss": 0.5059, "grad_norm": 1.2327780723571777, "learning_rate": 0.0002, "epoch": 6.762009534286762, "step": 9220}, {"loss": 0.486, "grad_norm": 1.3513109683990479, "learning_rate": 0.0002, "epoch": 6.769343601026769, "step": 9230}, {"loss": 0.5254, "grad_norm": 1.4762850999832153, "learning_rate": 0.0002, "epoch": 6.776677667766776, "step": 9240}, {"loss": 0.4181, "grad_norm": 1.0967189073562622, "learning_rate": 0.0002, "epoch": 6.784011734506784, "step": 9250}, {"loss": 0.4862, "grad_norm": 0.933936357498169, "learning_rate": 0.0002, "epoch": 6.791345801246791, "step": 9260}, {"loss": 0.4667, "grad_norm": 1.065553903579712, "learning_rate": 0.0002, "epoch": 6.798679867986799, "step": 9270}, {"loss": 0.5164, "grad_norm": 1.2044163942337036, "learning_rate": 0.0002, "epoch": 6.806013934726806, "step": 9280}, {"loss": 0.4648, "grad_norm": 1.404137134552002, "learning_rate": 0.0002, "epoch": 6.813348001466814, "step": 9290}, {"loss": 0.4442, "grad_norm": 1.4005582332611084, "learning_rate": 0.0002, "epoch": 6.82068206820682, "step": 9300}, {"loss": 0.459, "grad_norm": 1.1771104335784912, "learning_rate": 0.0002, "epoch": 6.828016134946828, "step": 9310}, {"loss": 0.5059, "grad_norm": 1.191933035850525, "learning_rate": 0.0002, "epoch": 6.835350201686835, "step": 9320}, {"loss": 0.4733, "grad_norm": 1.3395432233810425, "learning_rate": 0.0002, "epoch": 6.842684268426843, "step": 9330}, {"loss": 0.4882, "grad_norm": 1.4145503044128418, "learning_rate": 0.0002, "epoch": 6.85001833516685, "step": 9340}, {"loss": 0.4872, "grad_norm": 1.1128839254379272, "learning_rate": 0.0002, "epoch": 6.857352401906858, "step": 9350}, {"loss": 0.4909, "grad_norm": 1.0771174430847168, "learning_rate": 0.0002, "epoch": 6.864686468646864, "step": 9360}, {"loss": 0.4739, "grad_norm": 1.1089814901351929, "learning_rate": 0.0002, "epoch": 6.872020535386872, "step": 9370}, {"loss": 0.4854, "grad_norm": 1.078444004058838, "learning_rate": 0.0002, "epoch": 6.879354602126879, "step": 9380}, {"loss": 0.4904, "grad_norm": 1.3676636219024658, "learning_rate": 0.0002, "epoch": 6.886688668866887, "step": 9390}, {"loss": 0.4854, "grad_norm": 0.8973749876022339, "learning_rate": 0.0002, "epoch": 6.894022735606894, "step": 9400}, {"loss": 0.4274, "grad_norm": 1.141552448272705, "learning_rate": 0.0002, "epoch": 6.901356802346902, "step": 9410}, {"loss": 0.4972, "grad_norm": 0.8345359563827515, "learning_rate": 0.0002, "epoch": 6.908690869086909, "step": 9420}, {"loss": 0.5218, "grad_norm": 1.1602197885513306, "learning_rate": 0.0002, "epoch": 6.916024935826916, "step": 9430}, {"loss": 0.4911, "grad_norm": 1.275466799736023, "learning_rate": 0.0002, "epoch": 6.923359002566923, "step": 9440}, {"loss": 0.4904, "grad_norm": 0.9186071157455444, "learning_rate": 0.0002, "epoch": 6.930693069306931, "step": 9450}, {"loss": 0.4604, "grad_norm": 0.9069198966026306, "learning_rate": 0.0002, "epoch": 6.938027136046938, "step": 9460}, {"loss": 0.4363, "grad_norm": 1.2331899404525757, "learning_rate": 0.0002, "epoch": 6.945361202786946, "step": 9470}, {"loss": 0.4815, "grad_norm": 0.8685150742530823, "learning_rate": 0.0002, "epoch": 6.952695269526953, "step": 9480}, {"loss": 0.4424, "grad_norm": 1.4067939519882202, "learning_rate": 0.0002, "epoch": 6.96002933626696, "step": 9490}, {"loss": 0.5089, "grad_norm": 1.1864029169082642, "learning_rate": 0.0002, "epoch": 6.967363403006967, "step": 9500}, {"loss": 0.4906, "grad_norm": 1.3697725534439087, "learning_rate": 0.0002, "epoch": 6.974697469746975, "step": 9510}, {"loss": 0.4797, "grad_norm": 1.1632893085479736, "learning_rate": 0.0002, "epoch": 6.982031536486982, "step": 9520}, {"loss": 0.4526, "grad_norm": 1.1447268724441528, "learning_rate": 0.0002, "epoch": 6.9893656032269895, "step": 9530}, {"loss": 0.4627, "grad_norm": 1.5017213821411133, "learning_rate": 0.0002, "epoch": 6.996699669966997, "step": 9540}]} +{"epoch": 7.997066373303997, "step": 10904, "epoch_duration": 1482.7493734359741, "total_accumulated_duration": 11885.982197284698, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-4/checkpoint-2727", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.9722, "grad_norm": 0.47521963715553284, "learning_rate": 0.0002, "epoch": 0.007334066740007334, "step": 10}, {"loss": 1.4821, "grad_norm": 0.5395162105560303, "learning_rate": 0.0002, "epoch": 0.014668133480014669, "step": 20}, {"loss": 1.4202, "grad_norm": 0.4305780231952667, "learning_rate": 0.0002, "epoch": 0.022002200220022004, "step": 30}, {"loss": 1.4271, "grad_norm": 0.6938246488571167, "learning_rate": 0.0002, "epoch": 0.029336266960029337, "step": 40}, {"loss": 1.3112, "grad_norm": 1.5133819580078125, "learning_rate": 0.0002, "epoch": 0.03667033370003667, "step": 50}, {"loss": 1.3132, "grad_norm": 0.9173883199691772, "learning_rate": 0.0002, "epoch": 0.04400440044004401, "step": 60}, {"loss": 1.2844, "grad_norm": 0.4619861841201782, "learning_rate": 0.0002, "epoch": 0.05133846718005134, "step": 70}, {"loss": 1.2108, "grad_norm": 0.46118637919425964, "learning_rate": 0.0002, "epoch": 0.058672533920058674, "step": 80}, {"loss": 1.3441, "grad_norm": 0.4468648135662079, "learning_rate": 0.0002, "epoch": 0.066006600660066, "step": 90}, {"loss": 1.1863, "grad_norm": 0.46123769879341125, "learning_rate": 0.0002, "epoch": 0.07334066740007333, "step": 100}, {"loss": 1.2772, "grad_norm": 0.4859139025211334, "learning_rate": 0.0002, "epoch": 0.08067473414008068, "step": 110}, {"loss": 1.2087, "grad_norm": 0.4384922385215759, "learning_rate": 0.0002, "epoch": 0.08800880088008801, "step": 120}, {"loss": 1.2927, "grad_norm": 0.39519360661506653, "learning_rate": 0.0002, "epoch": 0.09534286762009535, "step": 130}, {"loss": 1.2349, "grad_norm": 0.4049859344959259, "learning_rate": 0.0002, "epoch": 0.10267693436010268, "step": 140}, {"loss": 1.293, "grad_norm": 0.4605638086795807, "learning_rate": 0.0002, "epoch": 0.11001100110011001, "step": 150}, {"loss": 1.2659, "grad_norm": 0.4201928377151489, "learning_rate": 0.0002, "epoch": 0.11734506784011735, "step": 160}, {"loss": 1.3961, "grad_norm": 0.5367777347564697, "learning_rate": 0.0002, "epoch": 0.12467913458012468, "step": 170}, {"loss": 1.2481, "grad_norm": 0.41752299666404724, "learning_rate": 0.0002, "epoch": 0.132013201320132, "step": 180}, {"loss": 1.207, "grad_norm": 0.31597763299942017, "learning_rate": 0.0002, "epoch": 0.13934726806013933, "step": 190}, {"loss": 1.2441, "grad_norm": 0.7468788623809814, "learning_rate": 0.0002, "epoch": 0.14668133480014667, "step": 200}, {"loss": 1.199, "grad_norm": 0.3403034508228302, "learning_rate": 0.0002, "epoch": 0.15401540154015403, "step": 210}, {"loss": 1.2439, "grad_norm": 0.34240293502807617, "learning_rate": 0.0002, "epoch": 0.16134946828016136, "step": 220}, {"loss": 1.2022, "grad_norm": 0.356158971786499, "learning_rate": 0.0002, "epoch": 0.1686835350201687, "step": 230}, {"loss": 1.207, "grad_norm": 0.3448857367038727, "learning_rate": 0.0002, "epoch": 0.17601760176017603, "step": 240}, {"loss": 1.2156, "grad_norm": 0.3475699722766876, "learning_rate": 0.0002, "epoch": 0.18335166850018336, "step": 250}, {"loss": 1.1551, "grad_norm": 0.2770358622074127, "learning_rate": 0.0002, "epoch": 0.1906857352401907, "step": 260}, {"loss": 1.2238, "grad_norm": 0.4310270845890045, "learning_rate": 0.0002, "epoch": 0.19801980198019803, "step": 270}, {"loss": 1.2917, "grad_norm": 0.335041880607605, "learning_rate": 0.0002, "epoch": 0.20535386872020536, "step": 280}, {"loss": 1.0959, "grad_norm": 0.3420602083206177, "learning_rate": 0.0002, "epoch": 0.2126879354602127, "step": 290}, {"loss": 1.1232, "grad_norm": 0.325001060962677, "learning_rate": 0.0002, "epoch": 0.22002200220022003, "step": 300}, {"loss": 1.2007, "grad_norm": 0.3027827739715576, "learning_rate": 0.0002, "epoch": 0.22735606894022736, "step": 310}, {"loss": 1.1803, "grad_norm": 0.435550719499588, "learning_rate": 0.0002, "epoch": 0.2346901356802347, "step": 320}, {"loss": 1.2045, "grad_norm": 0.3884522616863251, "learning_rate": 0.0002, "epoch": 0.24202420242024203, "step": 330}, {"loss": 1.2481, "grad_norm": 0.7736002206802368, "learning_rate": 0.0002, "epoch": 0.24935826916024936, "step": 340}, {"loss": 1.3606, "grad_norm": 0.35052821040153503, "learning_rate": 0.0002, "epoch": 0.2566923359002567, "step": 350}, {"loss": 1.2129, "grad_norm": 0.3311890959739685, "learning_rate": 0.0002, "epoch": 0.264026402640264, "step": 360}, {"loss": 1.2219, "grad_norm": 0.7473500370979309, "learning_rate": 0.0002, "epoch": 0.27136046938027136, "step": 370}, {"loss": 1.2712, "grad_norm": 0.3681875765323639, "learning_rate": 0.0002, "epoch": 0.27869453612027867, "step": 380}, {"loss": 1.2258, "grad_norm": 0.3764737844467163, "learning_rate": 0.0002, "epoch": 0.28602860286028603, "step": 390}, {"loss": 1.1917, "grad_norm": 0.4243989586830139, "learning_rate": 0.0002, "epoch": 0.29336266960029334, "step": 400}, {"loss": 1.199, "grad_norm": 0.2658531963825226, "learning_rate": 0.0002, "epoch": 0.3006967363403007, "step": 410}, {"loss": 1.1622, "grad_norm": 0.3436793386936188, "learning_rate": 0.0002, "epoch": 0.30803080308030806, "step": 420}, {"loss": 1.2953, "grad_norm": 0.5101129412651062, "learning_rate": 0.0002, "epoch": 0.31536486982031536, "step": 430}, {"loss": 1.1557, "grad_norm": 0.3319750726222992, "learning_rate": 0.0002, "epoch": 0.3226989365603227, "step": 440}, {"loss": 1.1804, "grad_norm": 0.385148286819458, "learning_rate": 0.0002, "epoch": 0.33003300330033003, "step": 450}, {"loss": 1.1808, "grad_norm": 0.3477935791015625, "learning_rate": 0.0002, "epoch": 0.3373670700403374, "step": 460}, {"loss": 1.1877, "grad_norm": 0.29748716950416565, "learning_rate": 0.0002, "epoch": 0.3447011367803447, "step": 470}, {"loss": 1.19, "grad_norm": 0.34083324670791626, "learning_rate": 0.0002, "epoch": 0.35203520352035206, "step": 480}, {"loss": 1.2, "grad_norm": 0.36904552578926086, "learning_rate": 0.0002, "epoch": 0.35936927026035936, "step": 490}, {"loss": 1.2223, "grad_norm": 0.315483033657074, "learning_rate": 0.0002, "epoch": 0.3667033370003667, "step": 500}, {"loss": 1.1461, "grad_norm": 0.44897955656051636, "learning_rate": 0.0002, "epoch": 0.37403740374037403, "step": 510}, {"loss": 1.3035, "grad_norm": 0.3160701394081116, "learning_rate": 0.0002, "epoch": 0.3813714704803814, "step": 520}, {"loss": 1.3197, "grad_norm": 0.29584741592407227, "learning_rate": 0.0002, "epoch": 0.3887055372203887, "step": 530}, {"loss": 1.2983, "grad_norm": 0.5430002808570862, "learning_rate": 0.0002, "epoch": 0.39603960396039606, "step": 540}, {"loss": 1.2459, "grad_norm": 0.2908070683479309, "learning_rate": 0.0002, "epoch": 0.40337367070040336, "step": 550}, {"loss": 1.2384, "grad_norm": 0.35066530108451843, "learning_rate": 0.0002, "epoch": 0.4107077374404107, "step": 560}, {"loss": 1.1784, "grad_norm": 0.37588003277778625, "learning_rate": 0.0002, "epoch": 0.41804180418041803, "step": 570}, {"loss": 1.2334, "grad_norm": 0.3112126886844635, "learning_rate": 0.0002, "epoch": 0.4253758709204254, "step": 580}, {"loss": 1.1439, "grad_norm": 0.35577139258384705, "learning_rate": 0.0002, "epoch": 0.4327099376604327, "step": 590}, {"loss": 1.184, "grad_norm": 0.31706422567367554, "learning_rate": 0.0002, "epoch": 0.44004400440044006, "step": 600}, {"loss": 1.2081, "grad_norm": 0.3249092102050781, "learning_rate": 0.0002, "epoch": 0.44737807114044736, "step": 610}, {"loss": 1.0824, "grad_norm": 0.3842705488204956, "learning_rate": 0.0002, "epoch": 0.4547121378804547, "step": 620}, {"loss": 1.2257, "grad_norm": 0.390991747379303, "learning_rate": 0.0002, "epoch": 0.46204620462046203, "step": 630}, {"loss": 1.1954, "grad_norm": 0.27532413601875305, "learning_rate": 0.0002, "epoch": 0.4693802713604694, "step": 640}, {"loss": 1.1058, "grad_norm": 0.31412816047668457, "learning_rate": 0.0002, "epoch": 0.4767143381004767, "step": 650}, {"loss": 1.1312, "grad_norm": 0.32117101550102234, "learning_rate": 0.0002, "epoch": 0.48404840484048406, "step": 660}, {"loss": 1.2423, "grad_norm": 0.3810010254383087, "learning_rate": 0.0002, "epoch": 0.49138247158049136, "step": 670}, {"loss": 1.1978, "grad_norm": 0.36289164423942566, "learning_rate": 0.0002, "epoch": 0.4987165383204987, "step": 680}, {"loss": 1.2034, "grad_norm": 0.34458720684051514, "learning_rate": 0.0002, "epoch": 0.506050605060506, "step": 690}, {"loss": 1.1756, "grad_norm": 0.32844600081443787, "learning_rate": 0.0002, "epoch": 0.5133846718005134, "step": 700}, {"loss": 1.0807, "grad_norm": 0.3144175708293915, "learning_rate": 0.0002, "epoch": 0.5207187385405208, "step": 710}, {"loss": 1.1952, "grad_norm": 0.3898887634277344, "learning_rate": 0.0002, "epoch": 0.528052805280528, "step": 720}, {"loss": 1.1244, "grad_norm": 1.3220758438110352, "learning_rate": 0.0002, "epoch": 0.5353868720205354, "step": 730}, {"loss": 1.227, "grad_norm": 0.3635874390602112, "learning_rate": 0.0002, "epoch": 0.5427209387605427, "step": 740}, {"loss": 1.2169, "grad_norm": 0.3138217628002167, "learning_rate": 0.0002, "epoch": 0.5500550055005501, "step": 750}, {"loss": 1.1516, "grad_norm": 0.4063207805156708, "learning_rate": 0.0002, "epoch": 0.5573890722405573, "step": 760}, {"loss": 1.1954, "grad_norm": 0.3926219940185547, "learning_rate": 0.0002, "epoch": 0.5647231389805647, "step": 770}, {"loss": 1.1726, "grad_norm": 0.31954652070999146, "learning_rate": 0.0002, "epoch": 0.5720572057205721, "step": 780}, {"loss": 1.2977, "grad_norm": 0.4248711168766022, "learning_rate": 0.0002, "epoch": 0.5793912724605794, "step": 790}, {"loss": 1.1728, "grad_norm": 0.643004834651947, "learning_rate": 0.0002, "epoch": 0.5867253392005867, "step": 800}, {"loss": 1.1793, "grad_norm": 0.3479592800140381, "learning_rate": 0.0002, "epoch": 0.594059405940594, "step": 810}, {"loss": 1.2426, "grad_norm": 0.4684754014015198, "learning_rate": 0.0002, "epoch": 0.6013934726806014, "step": 820}, {"loss": 1.2002, "grad_norm": 0.3739790916442871, "learning_rate": 0.0002, "epoch": 0.6087275394206088, "step": 830}, {"loss": 1.2139, "grad_norm": 0.40884748101234436, "learning_rate": 0.0002, "epoch": 0.6160616061606161, "step": 840}, {"loss": 1.1557, "grad_norm": 0.9722164273262024, "learning_rate": 0.0002, "epoch": 0.6233956729006234, "step": 850}, {"loss": 1.3069, "grad_norm": 0.42992347478866577, "learning_rate": 0.0002, "epoch": 0.6307297396406307, "step": 860}, {"loss": 1.1339, "grad_norm": 0.36654195189476013, "learning_rate": 0.0002, "epoch": 0.6380638063806381, "step": 870}, {"loss": 1.1932, "grad_norm": 0.4113832116127014, "learning_rate": 0.0002, "epoch": 0.6453978731206454, "step": 880}, {"loss": 1.2163, "grad_norm": 0.2948838770389557, "learning_rate": 0.0002, "epoch": 0.6527319398606527, "step": 890}, {"loss": 1.1081, "grad_norm": 0.38330280780792236, "learning_rate": 0.0002, "epoch": 0.6600660066006601, "step": 900}, {"loss": 1.1342, "grad_norm": 0.4428867697715759, "learning_rate": 0.0002, "epoch": 0.6674000733406674, "step": 910}, {"loss": 1.1021, "grad_norm": 0.23659265041351318, "learning_rate": 0.0002, "epoch": 0.6747341400806748, "step": 920}, {"loss": 1.1226, "grad_norm": 0.323685884475708, "learning_rate": 0.0002, "epoch": 0.682068206820682, "step": 930}, {"loss": 1.0853, "grad_norm": 0.39157727360725403, "learning_rate": 0.0002, "epoch": 0.6894022735606894, "step": 940}, {"loss": 1.1435, "grad_norm": 0.27189481258392334, "learning_rate": 0.0002, "epoch": 0.6967363403006968, "step": 950}, {"loss": 1.1033, "grad_norm": 0.529883861541748, "learning_rate": 0.0002, "epoch": 0.7040704070407041, "step": 960}, {"loss": 1.139, "grad_norm": 0.34758689999580383, "learning_rate": 0.0002, "epoch": 0.7114044737807114, "step": 970}, {"loss": 1.2197, "grad_norm": 0.831749439239502, "learning_rate": 0.0002, "epoch": 0.7187385405207187, "step": 980}, {"loss": 1.158, "grad_norm": 0.4438304007053375, "learning_rate": 0.0002, "epoch": 0.7260726072607261, "step": 990}, {"loss": 1.1021, "grad_norm": 0.33840006589889526, "learning_rate": 0.0002, "epoch": 0.7334066740007334, "step": 1000}, {"loss": 1.254, "grad_norm": 0.3454797863960266, "learning_rate": 0.0002, "epoch": 0.7407407407407407, "step": 1010}, {"loss": 1.106, "grad_norm": 0.38999441266059875, "learning_rate": 0.0002, "epoch": 0.7480748074807481, "step": 1020}, {"loss": 1.1428, "grad_norm": 0.2829911708831787, "learning_rate": 0.0002, "epoch": 0.7554088742207554, "step": 1030}, {"loss": 1.2123, "grad_norm": 0.36918163299560547, "learning_rate": 0.0002, "epoch": 0.7627429409607628, "step": 1040}, {"loss": 1.3028, "grad_norm": 0.3415680229663849, "learning_rate": 0.0002, "epoch": 0.77007700770077, "step": 1050}, {"loss": 1.1939, "grad_norm": 0.2974182963371277, "learning_rate": 0.0002, "epoch": 0.7774110744407774, "step": 1060}, {"loss": 1.194, "grad_norm": 0.3880919814109802, "learning_rate": 0.0002, "epoch": 0.7847451411807848, "step": 1070}, {"loss": 1.1095, "grad_norm": 0.33503302931785583, "learning_rate": 0.0002, "epoch": 0.7920792079207921, "step": 1080}, {"loss": 1.2111, "grad_norm": 0.3728407025337219, "learning_rate": 0.0002, "epoch": 0.7994132746607994, "step": 1090}, {"loss": 1.0835, "grad_norm": 0.3509373664855957, "learning_rate": 0.0002, "epoch": 0.8067473414008067, "step": 1100}, {"loss": 1.2661, "grad_norm": 0.42228564620018005, "learning_rate": 0.0002, "epoch": 0.8140814081408141, "step": 1110}, {"loss": 1.1788, "grad_norm": 0.313467800617218, "learning_rate": 0.0002, "epoch": 0.8214154748808215, "step": 1120}, {"loss": 1.1971, "grad_norm": 0.3378850817680359, "learning_rate": 0.0002, "epoch": 0.8287495416208287, "step": 1130}, {"loss": 1.1238, "grad_norm": 0.43200382590293884, "learning_rate": 0.0002, "epoch": 0.8360836083608361, "step": 1140}, {"loss": 1.3203, "grad_norm": 0.3309599459171295, "learning_rate": 0.0002, "epoch": 0.8434176751008434, "step": 1150}, {"loss": 1.1062, "grad_norm": 0.3526846170425415, "learning_rate": 0.0002, "epoch": 0.8507517418408508, "step": 1160}, {"loss": 1.0851, "grad_norm": 1.2722247838974, "learning_rate": 0.0002, "epoch": 0.858085808580858, "step": 1170}, {"loss": 1.0785, "grad_norm": 0.34142059087753296, "learning_rate": 0.0002, "epoch": 0.8654198753208654, "step": 1180}, {"loss": 1.2187, "grad_norm": 0.3805823028087616, "learning_rate": 0.0002, "epoch": 0.8727539420608728, "step": 1190}, {"loss": 1.1215, "grad_norm": 0.3931232690811157, "learning_rate": 0.0002, "epoch": 0.8800880088008801, "step": 1200}, {"loss": 1.0948, "grad_norm": 0.2937372624874115, "learning_rate": 0.0002, "epoch": 0.8874220755408874, "step": 1210}, {"loss": 1.1228, "grad_norm": 0.3757196366786957, "learning_rate": 0.0002, "epoch": 0.8947561422808947, "step": 1220}, {"loss": 1.1222, "grad_norm": 0.3502705991268158, "learning_rate": 0.0002, "epoch": 0.9020902090209021, "step": 1230}, {"loss": 1.2242, "grad_norm": 0.32758915424346924, "learning_rate": 0.0002, "epoch": 0.9094242757609095, "step": 1240}, {"loss": 1.215, "grad_norm": 0.37199416756629944, "learning_rate": 0.0002, "epoch": 0.9167583425009168, "step": 1250}, {"loss": 1.1225, "grad_norm": 0.3551490604877472, "learning_rate": 0.0002, "epoch": 0.9240924092409241, "step": 1260}, {"loss": 1.1966, "grad_norm": 0.2859550714492798, "learning_rate": 0.0002, "epoch": 0.9314264759809314, "step": 1270}, {"loss": 1.2186, "grad_norm": 0.427990585565567, "learning_rate": 0.0002, "epoch": 0.9387605427209388, "step": 1280}, {"loss": 1.2848, "grad_norm": 0.33717992901802063, "learning_rate": 0.0002, "epoch": 0.9460946094609461, "step": 1290}, {"loss": 1.1656, "grad_norm": 0.30225634574890137, "learning_rate": 0.0002, "epoch": 0.9534286762009534, "step": 1300}, {"loss": 1.2404, "grad_norm": 0.385821133852005, "learning_rate": 0.0002, "epoch": 0.9607627429409608, "step": 1310}, {"loss": 1.1932, "grad_norm": 0.35278066992759705, "learning_rate": 0.0002, "epoch": 0.9680968096809681, "step": 1320}, {"loss": 1.1071, "grad_norm": 0.49987098574638367, "learning_rate": 0.0002, "epoch": 0.9754308764209755, "step": 1330}, {"loss": 1.2259, "grad_norm": 0.3842747211456299, "learning_rate": 0.0002, "epoch": 0.9827649431609827, "step": 1340}, {"loss": 1.0862, "grad_norm": 0.6274653673171997, "learning_rate": 0.0002, "epoch": 0.9900990099009901, "step": 1350}, {"loss": 1.124, "grad_norm": 0.5239808559417725, "learning_rate": 0.0002, "epoch": 0.9974330766409975, "step": 1360}, {"eval_loss": 1.1822267770767212, "eval_runtime": 32.7389, "eval_samples_per_second": 13.165, "eval_steps_per_second": 1.649, "epoch": 0.9996332966629996, "step": 1363}, {"loss": 1.096, "grad_norm": 0.45311301946640015, "learning_rate": 0.0002, "epoch": 1.0047671433810048, "step": 1370}, {"loss": 1.0143, "grad_norm": 0.29685574769973755, "learning_rate": 0.0002, "epoch": 1.012101210121012, "step": 1380}, {"loss": 1.0302, "grad_norm": 0.3290937840938568, "learning_rate": 0.0002, "epoch": 1.0194352768610195, "step": 1390}, {"loss": 1.0295, "grad_norm": 0.3801758587360382, "learning_rate": 0.0002, "epoch": 1.0267693436010268, "step": 1400}, {"loss": 1.1226, "grad_norm": 0.794174313545227, "learning_rate": 0.0002, "epoch": 1.034103410341034, "step": 1410}, {"loss": 1.2232, "grad_norm": 0.3854154646396637, "learning_rate": 0.0002, "epoch": 1.0414374770810415, "step": 1420}, {"loss": 1.0652, "grad_norm": 0.32702451944351196, "learning_rate": 0.0002, "epoch": 1.0487715438210488, "step": 1430}, {"loss": 1.1144, "grad_norm": 0.7815203666687012, "learning_rate": 0.0002, "epoch": 1.056105610561056, "step": 1440}, {"loss": 1.1316, "grad_norm": 0.3087436854839325, "learning_rate": 0.0002, "epoch": 1.0634396773010635, "step": 1450}, {"loss": 1.1124, "grad_norm": 0.3847602903842926, "learning_rate": 0.0002, "epoch": 1.0707737440410707, "step": 1460}, {"loss": 1.1428, "grad_norm": 0.3693031370639801, "learning_rate": 0.0002, "epoch": 1.0781078107810782, "step": 1470}, {"loss": 1.0995, "grad_norm": 0.4111202359199524, "learning_rate": 0.0002, "epoch": 1.0854418775210855, "step": 1480}, {"loss": 1.0961, "grad_norm": 0.41452381014823914, "learning_rate": 0.0002, "epoch": 1.0927759442610927, "step": 1490}, {"loss": 1.1068, "grad_norm": 0.3336445093154907, "learning_rate": 0.0002, "epoch": 1.1001100110011002, "step": 1500}, {"loss": 1.0556, "grad_norm": 0.3923407793045044, "learning_rate": 0.0002, "epoch": 1.1074440777411074, "step": 1510}, {"loss": 1.1644, "grad_norm": 0.46215683221817017, "learning_rate": 0.0002, "epoch": 1.1147781444811147, "step": 1520}, {"loss": 1.1133, "grad_norm": 0.3592156767845154, "learning_rate": 0.0002, "epoch": 1.1221122112211221, "step": 1530}, {"loss": 1.0957, "grad_norm": 0.361110657453537, "learning_rate": 0.0002, "epoch": 1.1294462779611294, "step": 1540}, {"loss": 1.1553, "grad_norm": 0.5317131280899048, "learning_rate": 0.0002, "epoch": 1.1367803447011369, "step": 1550}, {"loss": 1.0368, "grad_norm": 0.3882388174533844, "learning_rate": 0.0002, "epoch": 1.1441144114411441, "step": 1560}, {"loss": 1.0805, "grad_norm": 0.3259428143501282, "learning_rate": 0.0002, "epoch": 1.1514484781811514, "step": 1570}, {"loss": 1.1819, "grad_norm": 0.410935640335083, "learning_rate": 0.0002, "epoch": 1.1587825449211588, "step": 1580}, {"loss": 1.1143, "grad_norm": 0.44940185546875, "learning_rate": 0.0002, "epoch": 1.166116611661166, "step": 1590}, {"loss": 1.0334, "grad_norm": 0.5106484293937683, "learning_rate": 0.0002, "epoch": 1.1734506784011733, "step": 1600}, {"loss": 1.2376, "grad_norm": 0.6603665947914124, "learning_rate": 0.0002, "epoch": 1.1807847451411808, "step": 1610}, {"loss": 1.1227, "grad_norm": 0.4799964129924774, "learning_rate": 0.0002, "epoch": 1.188118811881188, "step": 1620}, {"loss": 1.1191, "grad_norm": 0.4389883279800415, "learning_rate": 0.0002, "epoch": 1.1954528786211955, "step": 1630}, {"loss": 1.0667, "grad_norm": 0.4188813269138336, "learning_rate": 0.0002, "epoch": 1.2027869453612028, "step": 1640}, {"loss": 1.0605, "grad_norm": 0.7132157683372498, "learning_rate": 0.0002, "epoch": 1.21012101210121, "step": 1650}, {"loss": 1.0204, "grad_norm": 0.507480263710022, "learning_rate": 0.0002, "epoch": 1.2174550788412175, "step": 1660}, {"loss": 0.9948, "grad_norm": 0.9452332854270935, "learning_rate": 0.0002, "epoch": 1.2247891455812248, "step": 1670}, {"loss": 1.0228, "grad_norm": 0.4121614992618561, "learning_rate": 0.0002, "epoch": 1.2321232123212322, "step": 1680}, {"loss": 1.0366, "grad_norm": 0.34230247139930725, "learning_rate": 0.0002, "epoch": 1.2394572790612395, "step": 1690}, {"loss": 1.1289, "grad_norm": 0.4026208817958832, "learning_rate": 0.0002, "epoch": 1.2467913458012467, "step": 1700}, {"loss": 1.0206, "grad_norm": 0.46673697233200073, "learning_rate": 0.0002, "epoch": 1.2541254125412542, "step": 1710}, {"loss": 1.0827, "grad_norm": 0.38349825143814087, "learning_rate": 0.0002, "epoch": 1.2614594792812615, "step": 1720}, {"loss": 1.0356, "grad_norm": 0.4049997627735138, "learning_rate": 0.0002, "epoch": 1.2687935460212687, "step": 1730}, {"loss": 0.9504, "grad_norm": 0.3417615294456482, "learning_rate": 0.0002, "epoch": 1.2761276127612762, "step": 1740}, {"loss": 1.094, "grad_norm": 0.4277614951133728, "learning_rate": 0.0002, "epoch": 1.2834616795012834, "step": 1750}, {"loss": 0.9938, "grad_norm": 0.5864202976226807, "learning_rate": 0.0002, "epoch": 1.2907957462412907, "step": 1760}, {"loss": 1.1167, "grad_norm": 0.7097493410110474, "learning_rate": 0.0002, "epoch": 1.2981298129812981, "step": 1770}, {"loss": 1.1132, "grad_norm": 0.3145381212234497, "learning_rate": 0.0002, "epoch": 1.3054638797213054, "step": 1780}, {"loss": 1.1099, "grad_norm": 0.5116165280342102, "learning_rate": 0.0002, "epoch": 1.3127979464613129, "step": 1790}, {"loss": 1.0765, "grad_norm": 0.7469736337661743, "learning_rate": 0.0002, "epoch": 1.3201320132013201, "step": 1800}, {"loss": 1.0663, "grad_norm": 0.32272255420684814, "learning_rate": 0.0002, "epoch": 1.3274660799413276, "step": 1810}, {"loss": 0.9887, "grad_norm": 0.3534623086452484, "learning_rate": 0.0002, "epoch": 1.3348001466813348, "step": 1820}, {"loss": 1.1628, "grad_norm": 0.36127907037734985, "learning_rate": 0.0002, "epoch": 1.342134213421342, "step": 1830}, {"loss": 1.0972, "grad_norm": 0.4072401523590088, "learning_rate": 0.0002, "epoch": 1.3494682801613496, "step": 1840}, {"loss": 1.1267, "grad_norm": 0.3769161105155945, "learning_rate": 0.0002, "epoch": 1.3568023469013568, "step": 1850}, {"loss": 1.0173, "grad_norm": 0.412883460521698, "learning_rate": 0.0002, "epoch": 1.364136413641364, "step": 1860}, {"loss": 1.0265, "grad_norm": 0.3735875189304352, "learning_rate": 0.0002, "epoch": 1.3714704803813715, "step": 1870}, {"loss": 1.1061, "grad_norm": 0.39158159494400024, "learning_rate": 0.0002, "epoch": 1.3788045471213788, "step": 1880}, {"loss": 1.0433, "grad_norm": 0.44431769847869873, "learning_rate": 0.0002, "epoch": 1.386138613861386, "step": 1890}, {"loss": 1.0216, "grad_norm": 0.37772801518440247, "learning_rate": 0.0002, "epoch": 1.3934726806013935, "step": 1900}, {"loss": 1.0674, "grad_norm": 0.4056641757488251, "learning_rate": 0.0002, "epoch": 1.4008067473414008, "step": 1910}, {"loss": 1.0256, "grad_norm": 0.41612377762794495, "learning_rate": 0.0002, "epoch": 1.408140814081408, "step": 1920}, {"loss": 1.0467, "grad_norm": 0.41153013706207275, "learning_rate": 0.0002, "epoch": 1.4154748808214155, "step": 1930}, {"loss": 1.1062, "grad_norm": 0.387845516204834, "learning_rate": 0.0002, "epoch": 1.4228089475614227, "step": 1940}, {"loss": 1.1094, "grad_norm": 0.3809587061405182, "learning_rate": 0.0002, "epoch": 1.4301430143014302, "step": 1950}, {"loss": 1.0461, "grad_norm": 0.3625726103782654, "learning_rate": 0.0002, "epoch": 1.4374770810414375, "step": 1960}, {"loss": 0.9983, "grad_norm": 0.5294290781021118, "learning_rate": 0.0002, "epoch": 1.444811147781445, "step": 1970}, {"loss": 1.1114, "grad_norm": 0.39975494146347046, "learning_rate": 0.0002, "epoch": 1.4521452145214522, "step": 1980}, {"loss": 0.9704, "grad_norm": 0.4181167185306549, "learning_rate": 0.0002, "epoch": 1.4594792812614594, "step": 1990}, {"loss": 1.1146, "grad_norm": 0.42001503705978394, "learning_rate": 0.0002, "epoch": 1.466813348001467, "step": 2000}, {"loss": 1.1266, "grad_norm": 0.4877578616142273, "learning_rate": 0.0002, "epoch": 1.4741474147414741, "step": 2010}, {"loss": 1.1012, "grad_norm": 0.4050969183444977, "learning_rate": 0.0002, "epoch": 1.4814814814814814, "step": 2020}, {"loss": 1.0562, "grad_norm": 0.39068883657455444, "learning_rate": 0.0002, "epoch": 1.4888155482214889, "step": 2030}, {"loss": 1.0464, "grad_norm": 0.421282559633255, "learning_rate": 0.0002, "epoch": 1.4961496149614961, "step": 2040}, {"loss": 1.0532, "grad_norm": 0.47092297673225403, "learning_rate": 0.0002, "epoch": 1.5034836817015034, "step": 2050}, {"loss": 0.9348, "grad_norm": 0.39688974618911743, "learning_rate": 0.0002, "epoch": 1.5108177484415108, "step": 2060}, {"loss": 1.08, "grad_norm": 0.5529879331588745, "learning_rate": 0.0002, "epoch": 1.5181518151815183, "step": 2070}, {"loss": 1.1836, "grad_norm": 0.4879782199859619, "learning_rate": 0.0002, "epoch": 1.5254858819215253, "step": 2080}, {"loss": 1.0432, "grad_norm": 0.5517361164093018, "learning_rate": 0.0002, "epoch": 1.5328199486615328, "step": 2090}, {"loss": 1.0433, "grad_norm": 0.44015637040138245, "learning_rate": 0.0002, "epoch": 1.5401540154015403, "step": 2100}, {"loss": 1.1873, "grad_norm": 0.5435167551040649, "learning_rate": 0.0002, "epoch": 1.5474880821415475, "step": 2110}, {"loss": 1.1076, "grad_norm": 0.5714033246040344, "learning_rate": 0.0002, "epoch": 1.5548221488815548, "step": 2120}, {"loss": 1.1107, "grad_norm": 0.31732529401779175, "learning_rate": 0.0002, "epoch": 1.5621562156215623, "step": 2130}, {"loss": 1.0817, "grad_norm": 0.49068278074264526, "learning_rate": 0.0002, "epoch": 1.5694902823615695, "step": 2140}, {"loss": 1.0254, "grad_norm": 0.46851542592048645, "learning_rate": 0.0002, "epoch": 1.5768243491015768, "step": 2150}, {"loss": 1.0623, "grad_norm": 0.5083092451095581, "learning_rate": 0.0002, "epoch": 1.5841584158415842, "step": 2160}, {"loss": 1.0603, "grad_norm": 0.9822936058044434, "learning_rate": 0.0002, "epoch": 1.5914924825815915, "step": 2170}, {"loss": 0.9986, "grad_norm": 0.4575989246368408, "learning_rate": 0.0002, "epoch": 1.5988265493215987, "step": 2180}, {"loss": 1.1292, "grad_norm": 0.47444286942481995, "learning_rate": 0.0002, "epoch": 1.6061606160616062, "step": 2190}, {"loss": 1.0136, "grad_norm": 0.7208226919174194, "learning_rate": 0.0002, "epoch": 1.6134946828016135, "step": 2200}, {"loss": 1.15, "grad_norm": 0.43791481852531433, "learning_rate": 0.0002, "epoch": 1.6208287495416207, "step": 2210}, {"loss": 1.0961, "grad_norm": 0.5245792865753174, "learning_rate": 0.0002, "epoch": 1.6281628162816282, "step": 2220}, {"loss": 0.9957, "grad_norm": 0.39289429783821106, "learning_rate": 0.0002, "epoch": 1.6354968830216357, "step": 2230}, {"loss": 1.133, "grad_norm": 0.6106135845184326, "learning_rate": 0.0002, "epoch": 1.6428309497616427, "step": 2240}, {"loss": 1.0129, "grad_norm": 0.3722580671310425, "learning_rate": 0.0002, "epoch": 1.6501650165016502, "step": 2250}, {"loss": 1.0446, "grad_norm": 0.3649403750896454, "learning_rate": 0.0002, "epoch": 1.6574990832416576, "step": 2260}, {"loss": 1.0037, "grad_norm": 0.46514248847961426, "learning_rate": 0.0002, "epoch": 1.6648331499816649, "step": 2270}, {"loss": 1.0022, "grad_norm": 0.42034927010536194, "learning_rate": 0.0002, "epoch": 1.6721672167216721, "step": 2280}, {"loss": 1.1362, "grad_norm": 0.45202910900115967, "learning_rate": 0.0002, "epoch": 1.6795012834616796, "step": 2290}, {"loss": 1.0866, "grad_norm": 0.36257603764533997, "learning_rate": 0.0002, "epoch": 1.6868353502016868, "step": 2300}, {"loss": 1.0973, "grad_norm": 0.6340323090553284, "learning_rate": 0.0002, "epoch": 1.694169416941694, "step": 2310}, {"loss": 1.0615, "grad_norm": 0.4352878928184509, "learning_rate": 0.0002, "epoch": 1.7015034836817016, "step": 2320}, {"loss": 1.0629, "grad_norm": 0.45029792189598083, "learning_rate": 0.0002, "epoch": 1.7088375504217088, "step": 2330}, {"loss": 0.9621, "grad_norm": 0.3891315758228302, "learning_rate": 0.0002, "epoch": 1.716171617161716, "step": 2340}, {"loss": 0.9779, "grad_norm": 0.35180050134658813, "learning_rate": 0.0002, "epoch": 1.7235056839017235, "step": 2350}, {"loss": 1.0368, "grad_norm": 0.42367449402809143, "learning_rate": 0.0002, "epoch": 1.7308397506417308, "step": 2360}, {"loss": 1.0376, "grad_norm": 0.4553675353527069, "learning_rate": 0.0002, "epoch": 1.738173817381738, "step": 2370}, {"loss": 1.1467, "grad_norm": 0.5944654941558838, "learning_rate": 0.0002, "epoch": 1.7455078841217455, "step": 2380}, {"loss": 1.0548, "grad_norm": 0.3479664623737335, "learning_rate": 0.0002, "epoch": 1.752841950861753, "step": 2390}, {"loss": 1.0798, "grad_norm": 0.3585502505302429, "learning_rate": 0.0002, "epoch": 1.76017601760176, "step": 2400}, {"loss": 1.0983, "grad_norm": 0.4263346493244171, "learning_rate": 0.0002, "epoch": 1.7675100843417675, "step": 2410}, {"loss": 1.054, "grad_norm": 0.5476409196853638, "learning_rate": 0.0002, "epoch": 1.774844151081775, "step": 2420}, {"loss": 1.1615, "grad_norm": 0.3694186508655548, "learning_rate": 0.0002, "epoch": 1.7821782178217822, "step": 2430}, {"loss": 1.1343, "grad_norm": 0.9185658693313599, "learning_rate": 0.0002, "epoch": 1.7895122845617895, "step": 2440}, {"loss": 1.0764, "grad_norm": 0.7171908020973206, "learning_rate": 0.0002, "epoch": 1.796846351301797, "step": 2450}, {"loss": 1.1154, "grad_norm": 0.550658643245697, "learning_rate": 0.0002, "epoch": 1.8041804180418042, "step": 2460}, {"loss": 0.9975, "grad_norm": 0.4075568914413452, "learning_rate": 0.0002, "epoch": 1.8115144847818114, "step": 2470}, {"loss": 1.0935, "grad_norm": 0.3790127635002136, "learning_rate": 0.0002, "epoch": 1.818848551521819, "step": 2480}, {"loss": 0.9839, "grad_norm": 0.3576384484767914, "learning_rate": 0.0002, "epoch": 1.8261826182618262, "step": 2490}, {"loss": 1.1369, "grad_norm": 0.3919370770454407, "learning_rate": 0.0002, "epoch": 1.8335166850018334, "step": 2500}, {"loss": 0.9985, "grad_norm": 0.485083669424057, "learning_rate": 0.0002, "epoch": 1.8408507517418409, "step": 2510}, {"loss": 1.1585, "grad_norm": 0.4564347565174103, "learning_rate": 0.0002, "epoch": 1.8481848184818483, "step": 2520}, {"loss": 1.0944, "grad_norm": 0.3613106608390808, "learning_rate": 0.0002, "epoch": 1.8555188852218554, "step": 2530}, {"loss": 1.0819, "grad_norm": 0.39600759744644165, "learning_rate": 0.0002, "epoch": 1.8628529519618628, "step": 2540}, {"loss": 0.9453, "grad_norm": 1.123499870300293, "learning_rate": 0.0002, "epoch": 1.8701870187018703, "step": 2550}, {"loss": 1.0635, "grad_norm": 0.4612680673599243, "learning_rate": 0.0002, "epoch": 1.8775210854418776, "step": 2560}, {"loss": 1.0087, "grad_norm": 0.42745399475097656, "learning_rate": 0.0002, "epoch": 1.8848551521818848, "step": 2570}, {"loss": 1.0102, "grad_norm": 0.4055580198764801, "learning_rate": 0.0002, "epoch": 1.8921892189218923, "step": 2580}, {"loss": 1.0177, "grad_norm": 0.44174644351005554, "learning_rate": 0.0002, "epoch": 1.8995232856618995, "step": 2590}, {"loss": 0.9886, "grad_norm": 1.0228385925292969, "learning_rate": 0.0002, "epoch": 1.9068573524019068, "step": 2600}, {"loss": 1.0857, "grad_norm": 0.3496396243572235, "learning_rate": 0.0002, "epoch": 1.9141914191419143, "step": 2610}, {"loss": 1.0955, "grad_norm": 0.4191173017024994, "learning_rate": 0.0002, "epoch": 1.9215254858819215, "step": 2620}, {"loss": 1.0943, "grad_norm": 0.6778554916381836, "learning_rate": 0.0002, "epoch": 1.9288595526219288, "step": 2630}, {"loss": 1.0594, "grad_norm": 0.41992834210395813, "learning_rate": 0.0002, "epoch": 1.9361936193619362, "step": 2640}, {"loss": 1.1159, "grad_norm": 0.8760401010513306, "learning_rate": 0.0002, "epoch": 1.9435276861019435, "step": 2650}, {"loss": 1.0379, "grad_norm": 0.44049209356307983, "learning_rate": 0.0002, "epoch": 1.9508617528419507, "step": 2660}, {"loss": 1.1008, "grad_norm": 0.5651928782463074, "learning_rate": 0.0002, "epoch": 1.9581958195819582, "step": 2670}, {"loss": 1.1317, "grad_norm": 0.5292727947235107, "learning_rate": 0.0002, "epoch": 1.9655298863219657, "step": 2680}, {"loss": 1.1328, "grad_norm": 0.6012240648269653, "learning_rate": 0.0002, "epoch": 1.9728639530619727, "step": 2690}, {"loss": 1.0683, "grad_norm": 0.3945149779319763, "learning_rate": 0.0002, "epoch": 1.9801980198019802, "step": 2700}, {"loss": 1.0155, "grad_norm": 0.5732627511024475, "learning_rate": 0.0002, "epoch": 1.9875320865419877, "step": 2710}, {"loss": 0.9857, "grad_norm": 0.3963361084461212, "learning_rate": 0.0002, "epoch": 1.994866153281995, "step": 2720}, {"eval_loss": 1.1534006595611572, "eval_runtime": 32.7541, "eval_samples_per_second": 13.159, "eval_steps_per_second": 1.649, "epoch": 2.0, "step": 2727}, {"loss": 0.9624, "grad_norm": 0.48628315329551697, "learning_rate": 0.0002, "epoch": 2.002200220022002, "step": 2730}, {"loss": 0.9603, "grad_norm": 0.413875013589859, "learning_rate": 0.0002, "epoch": 2.0095342867620096, "step": 2740}, {"loss": 0.965, "grad_norm": 0.4988735616207123, "learning_rate": 0.0002, "epoch": 2.0168683535020167, "step": 2750}, {"loss": 0.9677, "grad_norm": 0.5634812712669373, "learning_rate": 0.0002, "epoch": 2.024202420242024, "step": 2760}, {"loss": 0.9547, "grad_norm": 0.48302653431892395, "learning_rate": 0.0002, "epoch": 2.0315364869820316, "step": 2770}, {"loss": 0.9346, "grad_norm": 0.49914175271987915, "learning_rate": 0.0002, "epoch": 2.038870553722039, "step": 2780}, {"loss": 0.904, "grad_norm": 1.14039945602417, "learning_rate": 0.0002, "epoch": 2.046204620462046, "step": 2790}, {"loss": 0.9588, "grad_norm": 0.6359720826148987, "learning_rate": 0.0002, "epoch": 2.0535386872020536, "step": 2800}, {"loss": 0.9031, "grad_norm": 0.4589158296585083, "learning_rate": 0.0002, "epoch": 2.060872753942061, "step": 2810}, {"loss": 0.9438, "grad_norm": 0.46255481243133545, "learning_rate": 0.0002, "epoch": 2.068206820682068, "step": 2820}, {"loss": 0.9464, "grad_norm": 0.6232137680053711, "learning_rate": 0.0002, "epoch": 2.0755408874220755, "step": 2830}, {"loss": 0.8978, "grad_norm": 0.41042178869247437, "learning_rate": 0.0002, "epoch": 2.082874954162083, "step": 2840}, {"loss": 0.8516, "grad_norm": 0.5334428548812866, "learning_rate": 0.0002, "epoch": 2.09020902090209, "step": 2850}, {"loss": 0.9313, "grad_norm": 0.8270058631896973, "learning_rate": 0.0002, "epoch": 2.0975430876420975, "step": 2860}, {"loss": 1.0064, "grad_norm": 0.6624533534049988, "learning_rate": 0.0002, "epoch": 2.104877154382105, "step": 2870}, {"loss": 0.9196, "grad_norm": 0.5448863506317139, "learning_rate": 0.0002, "epoch": 2.112211221122112, "step": 2880}, {"loss": 0.887, "grad_norm": 0.621482789516449, "learning_rate": 0.0002, "epoch": 2.1195452878621195, "step": 2890}, {"loss": 0.9702, "grad_norm": 0.4556255340576172, "learning_rate": 0.0002, "epoch": 2.126879354602127, "step": 2900}, {"loss": 0.9323, "grad_norm": 0.4620579183101654, "learning_rate": 0.0002, "epoch": 2.1342134213421344, "step": 2910}, {"loss": 0.836, "grad_norm": 0.9602415561676025, "learning_rate": 0.0002, "epoch": 2.1415474880821415, "step": 2920}, {"loss": 0.8826, "grad_norm": 0.587943971157074, "learning_rate": 0.0002, "epoch": 2.148881554822149, "step": 2930}, {"loss": 0.971, "grad_norm": 0.5121372938156128, "learning_rate": 0.0002, "epoch": 2.1562156215621564, "step": 2940}, {"loss": 0.8751, "grad_norm": 0.49424484372138977, "learning_rate": 0.0002, "epoch": 2.1635496883021634, "step": 2950}, {"loss": 0.8674, "grad_norm": 0.6312560439109802, "learning_rate": 0.0002, "epoch": 2.170883755042171, "step": 2960}, {"loss": 0.9791, "grad_norm": 0.5235576629638672, "learning_rate": 0.0002, "epoch": 2.1782178217821784, "step": 2970}, {"loss": 0.9706, "grad_norm": 0.5868439674377441, "learning_rate": 0.0002, "epoch": 2.1855518885221854, "step": 2980}, {"loss": 0.9338, "grad_norm": 0.42302873730659485, "learning_rate": 0.0002, "epoch": 2.192885955262193, "step": 2990}, {"loss": 0.9332, "grad_norm": 0.5097725987434387, "learning_rate": 0.0002, "epoch": 2.2002200220022003, "step": 3000}, {"loss": 0.9239, "grad_norm": 0.5091572403907776, "learning_rate": 0.0002, "epoch": 2.2075540887422074, "step": 3010}, {"loss": 0.8898, "grad_norm": 0.49433162808418274, "learning_rate": 0.0002, "epoch": 2.214888155482215, "step": 3020}, {"loss": 0.9734, "grad_norm": 0.5577368140220642, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 3030}, {"loss": 0.9033, "grad_norm": 0.6177583932876587, "learning_rate": 0.0002, "epoch": 2.2295562889622293, "step": 3040}, {"loss": 0.9882, "grad_norm": 0.5256719589233398, "learning_rate": 0.0002, "epoch": 2.236890355702237, "step": 3050}, {"loss": 0.9439, "grad_norm": 0.5001118183135986, "learning_rate": 0.0002, "epoch": 2.2442244224422443, "step": 3060}, {"loss": 0.8718, "grad_norm": 0.5721249580383301, "learning_rate": 0.0002, "epoch": 2.2515584891822513, "step": 3070}, {"loss": 1.0648, "grad_norm": 0.5325384140014648, "learning_rate": 0.0002, "epoch": 2.258892555922259, "step": 3080}, {"loss": 0.9843, "grad_norm": 0.5719189047813416, "learning_rate": 0.0002, "epoch": 2.2662266226622663, "step": 3090}, {"loss": 0.8633, "grad_norm": 0.6337835788726807, "learning_rate": 0.0002, "epoch": 2.2735606894022737, "step": 3100}, {"loss": 0.9962, "grad_norm": 0.5381836891174316, "learning_rate": 0.0002, "epoch": 2.2808947561422808, "step": 3110}, {"loss": 0.8265, "grad_norm": 0.5408531427383423, "learning_rate": 0.0002, "epoch": 2.2882288228822882, "step": 3120}, {"loss": 1.0325, "grad_norm": 0.43705281615257263, "learning_rate": 0.0002, "epoch": 2.2955628896222957, "step": 3130}, {"loss": 0.9388, "grad_norm": 0.6454030275344849, "learning_rate": 0.0002, "epoch": 2.3028969563623027, "step": 3140}, {"loss": 0.954, "grad_norm": 0.686030387878418, "learning_rate": 0.0002, "epoch": 2.31023102310231, "step": 3150}, {"loss": 0.9403, "grad_norm": 0.5123633146286011, "learning_rate": 0.0002, "epoch": 2.3175650898423177, "step": 3160}, {"loss": 0.8834, "grad_norm": 0.842506468296051, "learning_rate": 0.0002, "epoch": 2.3248991565823247, "step": 3170}, {"loss": 1.0497, "grad_norm": 0.5193818807601929, "learning_rate": 0.0002, "epoch": 2.332233223322332, "step": 3180}, {"loss": 0.9473, "grad_norm": 0.5634409189224243, "learning_rate": 0.0002, "epoch": 2.3395672900623397, "step": 3190}, {"loss": 0.8499, "grad_norm": 0.6475534439086914, "learning_rate": 0.0002, "epoch": 2.3469013568023467, "step": 3200}, {"loss": 0.874, "grad_norm": 1.1503914594650269, "learning_rate": 0.0002, "epoch": 2.354235423542354, "step": 3210}, {"loss": 0.9762, "grad_norm": 0.7234905362129211, "learning_rate": 0.0002, "epoch": 2.3615694902823616, "step": 3220}, {"loss": 0.9007, "grad_norm": 0.664903461933136, "learning_rate": 0.0002, "epoch": 2.368903557022369, "step": 3230}, {"loss": 0.9987, "grad_norm": 0.5453006625175476, "learning_rate": 0.0002, "epoch": 2.376237623762376, "step": 3240}, {"loss": 0.9742, "grad_norm": 0.6256654262542725, "learning_rate": 0.0002, "epoch": 2.3835716905023836, "step": 3250}, {"loss": 0.9922, "grad_norm": 0.5166565179824829, "learning_rate": 0.0002, "epoch": 2.390905757242391, "step": 3260}, {"loss": 0.927, "grad_norm": 0.5699098110198975, "learning_rate": 0.0002, "epoch": 2.398239823982398, "step": 3270}, {"loss": 0.8878, "grad_norm": 0.4472540020942688, "learning_rate": 0.0002, "epoch": 2.4055738907224056, "step": 3280}, {"loss": 0.9439, "grad_norm": 0.6790403127670288, "learning_rate": 0.0002, "epoch": 2.412907957462413, "step": 3290}, {"loss": 0.972, "grad_norm": 0.5182185173034668, "learning_rate": 0.0002, "epoch": 2.42024202420242, "step": 3300}, {"loss": 0.9775, "grad_norm": 0.564647912979126, "learning_rate": 0.0002, "epoch": 2.4275760909424275, "step": 3310}, {"loss": 1.072, "grad_norm": 0.5625313520431519, "learning_rate": 0.0002, "epoch": 2.434910157682435, "step": 3320}, {"loss": 0.8798, "grad_norm": 0.7496559619903564, "learning_rate": 0.0002, "epoch": 2.442244224422442, "step": 3330}, {"loss": 0.868, "grad_norm": 0.4779128134250641, "learning_rate": 0.0002, "epoch": 2.4495782911624495, "step": 3340}, {"loss": 1.0316, "grad_norm": 0.578093409538269, "learning_rate": 0.0002, "epoch": 2.456912357902457, "step": 3350}, {"loss": 0.9282, "grad_norm": 0.5456080436706543, "learning_rate": 0.0002, "epoch": 2.4642464246424645, "step": 3360}, {"loss": 0.8409, "grad_norm": 0.4769273102283478, "learning_rate": 0.0002, "epoch": 2.4715804913824715, "step": 3370}, {"loss": 0.9312, "grad_norm": 0.5608189702033997, "learning_rate": 0.0002, "epoch": 2.478914558122479, "step": 3380}, {"loss": 0.9934, "grad_norm": 0.5590165853500366, "learning_rate": 0.0002, "epoch": 2.4862486248624864, "step": 3390}, {"loss": 1.025, "grad_norm": 0.801306962966919, "learning_rate": 0.0002, "epoch": 2.4935826916024935, "step": 3400}, {"loss": 0.9049, "grad_norm": 0.6045624613761902, "learning_rate": 0.0002, "epoch": 2.500916758342501, "step": 3410}, {"loss": 0.944, "grad_norm": 0.5735858082771301, "learning_rate": 0.0002, "epoch": 2.5082508250825084, "step": 3420}, {"loss": 0.9846, "grad_norm": 0.6827309131622314, "learning_rate": 0.0002, "epoch": 2.5155848918225154, "step": 3430}, {"loss": 0.9789, "grad_norm": 0.5702602863311768, "learning_rate": 0.0002, "epoch": 2.522918958562523, "step": 3440}, {"loss": 0.9127, "grad_norm": 0.6674721240997314, "learning_rate": 0.0002, "epoch": 2.5302530253025304, "step": 3450}, {"loss": 0.914, "grad_norm": 0.5635907649993896, "learning_rate": 0.0002, "epoch": 2.5375870920425374, "step": 3460}, {"loss": 0.8398, "grad_norm": 0.42737770080566406, "learning_rate": 0.0002, "epoch": 2.544921158782545, "step": 3470}, {"loss": 0.9474, "grad_norm": 0.6720691919326782, "learning_rate": 0.0002, "epoch": 2.5522552255225524, "step": 3480}, {"loss": 0.8637, "grad_norm": 0.8917084336280823, "learning_rate": 0.0002, "epoch": 2.55958929226256, "step": 3490}, {"loss": 0.9257, "grad_norm": 0.5134549140930176, "learning_rate": 0.0002, "epoch": 2.566923359002567, "step": 3500}, {"loss": 0.9362, "grad_norm": 0.4951367974281311, "learning_rate": 0.0002, "epoch": 2.5742574257425743, "step": 3510}, {"loss": 0.9184, "grad_norm": 0.9438204765319824, "learning_rate": 0.0002, "epoch": 2.5815914924825814, "step": 3520}, {"loss": 0.8939, "grad_norm": 0.6024714708328247, "learning_rate": 0.0002, "epoch": 2.588925559222589, "step": 3530}, {"loss": 0.9298, "grad_norm": 0.5248535871505737, "learning_rate": 0.0002, "epoch": 2.5962596259625963, "step": 3540}, {"loss": 0.941, "grad_norm": 0.8677568435668945, "learning_rate": 0.0002, "epoch": 2.6035936927026038, "step": 3550}, {"loss": 0.9253, "grad_norm": 0.82008296251297, "learning_rate": 0.0002, "epoch": 2.610927759442611, "step": 3560}, {"loss": 0.8429, "grad_norm": 0.4724634885787964, "learning_rate": 0.0002, "epoch": 2.6182618261826183, "step": 3570}, {"loss": 0.9058, "grad_norm": 0.5434244275093079, "learning_rate": 0.0002, "epoch": 2.6255958929226257, "step": 3580}, {"loss": 0.9379, "grad_norm": 0.4948740005493164, "learning_rate": 0.0002, "epoch": 2.6329299596626328, "step": 3590}, {"loss": 0.8718, "grad_norm": 0.42109328508377075, "learning_rate": 0.0002, "epoch": 2.6402640264026402, "step": 3600}, {"loss": 0.9809, "grad_norm": 0.7979786396026611, "learning_rate": 0.0002, "epoch": 2.6475980931426477, "step": 3610}, {"loss": 0.9229, "grad_norm": 0.6345919370651245, "learning_rate": 0.0002, "epoch": 2.654932159882655, "step": 3620}, {"loss": 0.8506, "grad_norm": 0.4971671402454376, "learning_rate": 0.0002, "epoch": 2.662266226622662, "step": 3630}, {"loss": 0.8054, "grad_norm": 0.6467748284339905, "learning_rate": 0.0002, "epoch": 2.6696002933626697, "step": 3640}, {"loss": 0.9277, "grad_norm": 0.4240160286426544, "learning_rate": 0.0002, "epoch": 2.6769343601026767, "step": 3650}, {"loss": 0.8213, "grad_norm": 0.5179754495620728, "learning_rate": 0.0002, "epoch": 2.684268426842684, "step": 3660}, {"loss": 0.9221, "grad_norm": 0.754012405872345, "learning_rate": 0.0002, "epoch": 2.6916024935826917, "step": 3670}, {"loss": 0.9194, "grad_norm": 0.5141299962997437, "learning_rate": 0.0002, "epoch": 2.698936560322699, "step": 3680}, {"loss": 0.9495, "grad_norm": 0.5737819075584412, "learning_rate": 0.0002, "epoch": 2.706270627062706, "step": 3690}, {"loss": 1.0162, "grad_norm": 0.5887577533721924, "learning_rate": 0.0002, "epoch": 2.7136046938027136, "step": 3700}, {"loss": 0.9169, "grad_norm": 0.6740471720695496, "learning_rate": 0.0002, "epoch": 2.720938760542721, "step": 3710}, {"loss": 0.9297, "grad_norm": 0.5879453420639038, "learning_rate": 0.0002, "epoch": 2.728272827282728, "step": 3720}, {"loss": 0.9358, "grad_norm": 0.4858354926109314, "learning_rate": 0.0002, "epoch": 2.7356068940227356, "step": 3730}, {"loss": 0.9308, "grad_norm": 0.5489001870155334, "learning_rate": 0.0002, "epoch": 2.742940960762743, "step": 3740}, {"loss": 0.894, "grad_norm": 0.8187092542648315, "learning_rate": 0.0002, "epoch": 2.7502750275027505, "step": 3750}, {"loss": 0.8954, "grad_norm": 0.5666626691818237, "learning_rate": 0.0002, "epoch": 2.7576090942427576, "step": 3760}, {"loss": 1.0059, "grad_norm": 0.5377066135406494, "learning_rate": 0.0002, "epoch": 2.764943160982765, "step": 3770}, {"loss": 0.9132, "grad_norm": 0.566330075263977, "learning_rate": 0.0002, "epoch": 2.772277227722772, "step": 3780}, {"loss": 0.9415, "grad_norm": 0.5522832870483398, "learning_rate": 0.0002, "epoch": 2.7796112944627795, "step": 3790}, {"loss": 0.8816, "grad_norm": 0.5668695569038391, "learning_rate": 0.0002, "epoch": 2.786945361202787, "step": 3800}, {"loss": 0.8885, "grad_norm": 0.7566602826118469, "learning_rate": 0.0002, "epoch": 2.7942794279427945, "step": 3810}, {"loss": 0.8598, "grad_norm": 0.5603684782981873, "learning_rate": 0.0002, "epoch": 2.8016134946828015, "step": 3820}, {"loss": 0.9602, "grad_norm": 0.49122217297554016, "learning_rate": 0.0002, "epoch": 2.808947561422809, "step": 3830}, {"loss": 0.9738, "grad_norm": 0.6798251867294312, "learning_rate": 0.0002, "epoch": 2.816281628162816, "step": 3840}, {"loss": 0.9533, "grad_norm": 0.6097991466522217, "learning_rate": 0.0002, "epoch": 2.8236156949028235, "step": 3850}, {"loss": 0.8672, "grad_norm": 0.6675726175308228, "learning_rate": 0.0002, "epoch": 2.830949761642831, "step": 3860}, {"loss": 0.9324, "grad_norm": 0.9223952889442444, "learning_rate": 0.0002, "epoch": 2.8382838283828384, "step": 3870}, {"loss": 0.8767, "grad_norm": 0.6020799875259399, "learning_rate": 0.0002, "epoch": 2.8456178951228455, "step": 3880}, {"loss": 0.9148, "grad_norm": 0.5206381678581238, "learning_rate": 0.0002, "epoch": 2.852951961862853, "step": 3890}, {"loss": 0.9479, "grad_norm": 0.6268777251243591, "learning_rate": 0.0002, "epoch": 2.8602860286028604, "step": 3900}, {"loss": 0.9409, "grad_norm": 1.1583497524261475, "learning_rate": 0.0002, "epoch": 2.8676200953428674, "step": 3910}, {"loss": 0.895, "grad_norm": 0.7263903021812439, "learning_rate": 0.0002, "epoch": 2.874954162082875, "step": 3920}, {"loss": 0.8786, "grad_norm": 0.5369910001754761, "learning_rate": 0.0002, "epoch": 2.8822882288228824, "step": 3930}, {"loss": 1.0015, "grad_norm": 0.7298350930213928, "learning_rate": 0.0002, "epoch": 2.88962229556289, "step": 3940}, {"loss": 0.979, "grad_norm": 0.577012836933136, "learning_rate": 0.0002, "epoch": 2.896956362302897, "step": 3950}, {"loss": 0.9716, "grad_norm": 0.5859594345092773, "learning_rate": 0.0002, "epoch": 2.9042904290429044, "step": 3960}, {"loss": 0.8772, "grad_norm": 0.47176122665405273, "learning_rate": 0.0002, "epoch": 2.9116244957829114, "step": 3970}, {"loss": 0.8997, "grad_norm": 0.9699620604515076, "learning_rate": 0.0002, "epoch": 2.918958562522919, "step": 3980}, {"loss": 0.9057, "grad_norm": 0.7908747792243958, "learning_rate": 0.0002, "epoch": 2.9262926292629263, "step": 3990}, {"loss": 0.9462, "grad_norm": 0.5777379274368286, "learning_rate": 0.0002, "epoch": 2.933626696002934, "step": 4000}, {"loss": 0.9358, "grad_norm": 0.599288284778595, "learning_rate": 0.0002, "epoch": 2.940960762742941, "step": 4010}, {"loss": 0.9812, "grad_norm": 0.5232274532318115, "learning_rate": 0.0002, "epoch": 2.9482948294829483, "step": 4020}, {"loss": 0.96, "grad_norm": 0.6395137310028076, "learning_rate": 0.0002, "epoch": 2.9556288962229558, "step": 4030}, {"loss": 0.9813, "grad_norm": 0.589260458946228, "learning_rate": 0.0002, "epoch": 2.962962962962963, "step": 4040}, {"loss": 0.9541, "grad_norm": 0.5699581503868103, "learning_rate": 0.0002, "epoch": 2.9702970297029703, "step": 4050}, {"loss": 0.9585, "grad_norm": 0.528468132019043, "learning_rate": 0.0002, "epoch": 2.9776310964429777, "step": 4060}, {"loss": 0.9164, "grad_norm": 0.4804670512676239, "learning_rate": 0.0002, "epoch": 2.984965163182985, "step": 4070}, {"loss": 0.9771, "grad_norm": 1.1918889284133911, "learning_rate": 0.0002, "epoch": 2.9922992299229922, "step": 4080}, {"loss": 0.9178, "grad_norm": 0.5479103326797485, "learning_rate": 0.0002, "epoch": 2.9996332966629997, "step": 4090}, {"eval_loss": 1.1642853021621704, "eval_runtime": 32.7511, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.649, "epoch": 2.9996332966629997, "step": 4090}, {"loss": 0.7981, "grad_norm": 0.7430027723312378, "learning_rate": 0.0002, "epoch": 3.006967363403007, "step": 4100}, {"loss": 0.7871, "grad_norm": 0.6293647289276123, "learning_rate": 0.0002, "epoch": 3.014301430143014, "step": 4110}, {"loss": 0.78, "grad_norm": 0.6191329956054688, "learning_rate": 0.0002, "epoch": 3.0216354968830217, "step": 4120}, {"loss": 0.7618, "grad_norm": 0.7959313988685608, "learning_rate": 0.0002, "epoch": 3.028969563623029, "step": 4130}, {"loss": 0.8039, "grad_norm": 0.5956351161003113, "learning_rate": 0.0002, "epoch": 3.036303630363036, "step": 4140}, {"loss": 0.7477, "grad_norm": 0.670383632183075, "learning_rate": 0.0002, "epoch": 3.0436376971030437, "step": 4150}, {"loss": 0.7984, "grad_norm": 0.6414518356323242, "learning_rate": 0.0002, "epoch": 3.050971763843051, "step": 4160}, {"loss": 0.7369, "grad_norm": 0.7928852438926697, "learning_rate": 0.0002, "epoch": 3.058305830583058, "step": 4170}, {"loss": 0.7914, "grad_norm": 0.6211121082305908, "learning_rate": 0.0002, "epoch": 3.0656398973230656, "step": 4180}, {"loss": 0.7365, "grad_norm": 0.6237057447433472, "learning_rate": 0.0002, "epoch": 3.072973964063073, "step": 4190}, {"loss": 0.702, "grad_norm": 0.6522233486175537, "learning_rate": 0.0002, "epoch": 3.08030803080308, "step": 4200}, {"loss": 0.7646, "grad_norm": 0.9396848678588867, "learning_rate": 0.0002, "epoch": 3.0876420975430876, "step": 4210}, {"loss": 0.7559, "grad_norm": 0.8003010749816895, "learning_rate": 0.0002, "epoch": 3.094976164283095, "step": 4220}, {"loss": 0.711, "grad_norm": 0.6733810305595398, "learning_rate": 0.0002, "epoch": 3.102310231023102, "step": 4230}, {"loss": 0.696, "grad_norm": 0.6365828514099121, "learning_rate": 0.0002, "epoch": 3.1096442977631096, "step": 4240}, {"loss": 0.8362, "grad_norm": 1.0805548429489136, "learning_rate": 0.0002, "epoch": 3.116978364503117, "step": 4250}, {"loss": 0.7651, "grad_norm": 0.7262141108512878, "learning_rate": 0.0002, "epoch": 3.1243124312431245, "step": 4260}, {"loss": 0.7304, "grad_norm": 0.5500539541244507, "learning_rate": 0.0002, "epoch": 3.1316464979831315, "step": 4270}, {"loss": 0.7721, "grad_norm": 0.793912947177887, "learning_rate": 0.0002, "epoch": 3.138980564723139, "step": 4280}, {"loss": 0.7708, "grad_norm": 1.2540518045425415, "learning_rate": 0.0002, "epoch": 3.1463146314631465, "step": 4290}, {"loss": 0.782, "grad_norm": 0.7020077705383301, "learning_rate": 0.0002, "epoch": 3.1536486982031535, "step": 4300}, {"loss": 0.7253, "grad_norm": 0.5111123323440552, "learning_rate": 0.0002, "epoch": 3.160982764943161, "step": 4310}, {"loss": 0.8159, "grad_norm": 0.7172090411186218, "learning_rate": 0.0002, "epoch": 3.1683168316831685, "step": 4320}, {"loss": 0.6962, "grad_norm": 0.6343168616294861, "learning_rate": 0.0002, "epoch": 3.1756508984231755, "step": 4330}, {"loss": 0.7938, "grad_norm": 0.9563672542572021, "learning_rate": 0.0002, "epoch": 3.182984965163183, "step": 4340}, {"loss": 0.7385, "grad_norm": 1.0225574970245361, "learning_rate": 0.0002, "epoch": 3.1903190319031904, "step": 4350}, {"loss": 0.8652, "grad_norm": 1.1633386611938477, "learning_rate": 0.0002, "epoch": 3.1976530986431975, "step": 4360}, {"loss": 0.7259, "grad_norm": 0.8915148973464966, "learning_rate": 0.0002, "epoch": 3.204987165383205, "step": 4370}, {"loss": 0.8061, "grad_norm": 0.9156812429428101, "learning_rate": 0.0002, "epoch": 3.2123212321232124, "step": 4380}, {"loss": 0.8189, "grad_norm": 0.6363258957862854, "learning_rate": 0.0002, "epoch": 3.21965529886322, "step": 4390}, {"loss": 0.7996, "grad_norm": 0.579099178314209, "learning_rate": 0.0002, "epoch": 3.226989365603227, "step": 4400}, {"loss": 0.8592, "grad_norm": 0.8778146505355835, "learning_rate": 0.0002, "epoch": 3.2343234323432344, "step": 4410}, {"loss": 0.8281, "grad_norm": 0.8356770873069763, "learning_rate": 0.0002, "epoch": 3.241657499083242, "step": 4420}, {"loss": 0.8484, "grad_norm": 0.702032208442688, "learning_rate": 0.0002, "epoch": 3.248991565823249, "step": 4430}, {"loss": 0.7227, "grad_norm": 0.6386539340019226, "learning_rate": 0.0002, "epoch": 3.2563256325632564, "step": 4440}, {"loss": 0.8374, "grad_norm": 0.7008408904075623, "learning_rate": 0.0002, "epoch": 3.263659699303264, "step": 4450}, {"loss": 0.7572, "grad_norm": 0.9556332230567932, "learning_rate": 0.0002, "epoch": 3.270993766043271, "step": 4460}, {"loss": 0.743, "grad_norm": 0.5667835474014282, "learning_rate": 0.0002, "epoch": 3.2783278327832783, "step": 4470}, {"loss": 0.8152, "grad_norm": 0.8239172697067261, "learning_rate": 0.0002, "epoch": 3.285661899523286, "step": 4480}, {"loss": 0.756, "grad_norm": 0.7045050859451294, "learning_rate": 0.0002, "epoch": 3.292995966263293, "step": 4490}, {"loss": 0.7655, "grad_norm": 0.7131434082984924, "learning_rate": 0.0002, "epoch": 3.3003300330033003, "step": 4500}, {"loss": 0.836, "grad_norm": 0.6924910545349121, "learning_rate": 0.0002, "epoch": 3.3076640997433078, "step": 4510}, {"loss": 0.736, "grad_norm": 0.8945356607437134, "learning_rate": 0.0002, "epoch": 3.3149981664833152, "step": 4520}, {"loss": 0.7575, "grad_norm": 0.6546903252601624, "learning_rate": 0.0002, "epoch": 3.3223322332233223, "step": 4530}, {"loss": 0.7893, "grad_norm": 0.8206679224967957, "learning_rate": 0.0002, "epoch": 3.3296662999633297, "step": 4540}, {"loss": 0.7502, "grad_norm": 0.6482203602790833, "learning_rate": 0.0002, "epoch": 3.3370003667033368, "step": 4550}, {"loss": 0.8172, "grad_norm": 0.7558760046958923, "learning_rate": 0.0002, "epoch": 3.3443344334433442, "step": 4560}, {"loss": 0.744, "grad_norm": 0.7794756889343262, "learning_rate": 0.0002, "epoch": 3.3516685001833517, "step": 4570}, {"loss": 0.7385, "grad_norm": 0.7382805943489075, "learning_rate": 0.0002, "epoch": 3.359002566923359, "step": 4580}, {"loss": 0.8511, "grad_norm": 0.5912511944770813, "learning_rate": 0.0002, "epoch": 3.366336633663366, "step": 4590}, {"loss": 0.8272, "grad_norm": 0.7444885969161987, "learning_rate": 0.0002, "epoch": 3.3736707004033737, "step": 4600}, {"loss": 0.7927, "grad_norm": 0.7354922890663147, "learning_rate": 0.0002, "epoch": 3.381004767143381, "step": 4610}, {"loss": 0.7183, "grad_norm": 0.7685934901237488, "learning_rate": 0.0002, "epoch": 3.388338833883388, "step": 4620}, {"loss": 0.7436, "grad_norm": 0.61041259765625, "learning_rate": 0.0002, "epoch": 3.3956729006233957, "step": 4630}, {"loss": 0.7661, "grad_norm": 0.6820451021194458, "learning_rate": 0.0002, "epoch": 3.403006967363403, "step": 4640}, {"loss": 0.8796, "grad_norm": 0.5819534063339233, "learning_rate": 0.0002, "epoch": 3.41034103410341, "step": 4650}, {"loss": 0.7314, "grad_norm": 0.705410897731781, "learning_rate": 0.0002, "epoch": 3.4176751008434176, "step": 4660}, {"loss": 0.7901, "grad_norm": 0.8052892088890076, "learning_rate": 0.0002, "epoch": 3.425009167583425, "step": 4670}, {"loss": 0.7298, "grad_norm": 0.7746483087539673, "learning_rate": 0.0002, "epoch": 3.432343234323432, "step": 4680}, {"loss": 0.7976, "grad_norm": 0.7713689804077148, "learning_rate": 0.0002, "epoch": 3.4396773010634396, "step": 4690}, {"loss": 0.7427, "grad_norm": 0.810371994972229, "learning_rate": 0.0002, "epoch": 3.447011367803447, "step": 4700}, {"loss": 0.7594, "grad_norm": 0.7702969312667847, "learning_rate": 0.0002, "epoch": 3.4543454345434546, "step": 4710}, {"loss": 0.7957, "grad_norm": 0.7069268822669983, "learning_rate": 0.0002, "epoch": 3.4616795012834616, "step": 4720}, {"loss": 0.8199, "grad_norm": 0.7640359401702881, "learning_rate": 0.0002, "epoch": 3.469013568023469, "step": 4730}, {"loss": 0.6875, "grad_norm": 0.8661707639694214, "learning_rate": 0.0002, "epoch": 3.4763476347634765, "step": 4740}, {"loss": 0.8528, "grad_norm": 0.9970282912254333, "learning_rate": 0.0002, "epoch": 3.4836817015034836, "step": 4750}, {"loss": 0.8462, "grad_norm": 0.5824355483055115, "learning_rate": 0.0002, "epoch": 3.491015768243491, "step": 4760}, {"loss": 0.851, "grad_norm": 1.3072649240493774, "learning_rate": 0.0002, "epoch": 3.4983498349834985, "step": 4770}, {"loss": 0.9101, "grad_norm": 0.873978316783905, "learning_rate": 0.0002, "epoch": 3.5056839017235055, "step": 4780}, {"loss": 0.7403, "grad_norm": 0.5526657104492188, "learning_rate": 0.0002, "epoch": 3.513017968463513, "step": 4790}, {"loss": 0.7921, "grad_norm": 0.790894627571106, "learning_rate": 0.0002, "epoch": 3.5203520352035205, "step": 4800}, {"loss": 0.831, "grad_norm": 0.8119630217552185, "learning_rate": 0.0002, "epoch": 3.5276861019435275, "step": 4810}, {"loss": 0.7351, "grad_norm": 0.633212149143219, "learning_rate": 0.0002, "epoch": 3.535020168683535, "step": 4820}, {"loss": 0.8505, "grad_norm": 0.703029990196228, "learning_rate": 0.0002, "epoch": 3.5423542354235424, "step": 4830}, {"loss": 0.7204, "grad_norm": 0.7603771686553955, "learning_rate": 0.0002, "epoch": 3.54968830216355, "step": 4840}, {"loss": 0.8868, "grad_norm": 0.6260480880737305, "learning_rate": 0.0002, "epoch": 3.557022368903557, "step": 4850}, {"loss": 0.8137, "grad_norm": 0.8203664422035217, "learning_rate": 0.0002, "epoch": 3.5643564356435644, "step": 4860}, {"loss": 0.8821, "grad_norm": 0.7793813347816467, "learning_rate": 0.0002, "epoch": 3.5716905023835714, "step": 4870}, {"loss": 0.8164, "grad_norm": 0.7667397260665894, "learning_rate": 0.0002, "epoch": 3.579024569123579, "step": 4880}, {"loss": 0.7597, "grad_norm": 0.8198829889297485, "learning_rate": 0.0002, "epoch": 3.5863586358635864, "step": 4890}, {"loss": 0.7027, "grad_norm": 0.7689233422279358, "learning_rate": 0.0002, "epoch": 3.593692702603594, "step": 4900}, {"loss": 0.804, "grad_norm": 0.7870983481407166, "learning_rate": 0.0002, "epoch": 3.601026769343601, "step": 4910}, {"loss": 0.8269, "grad_norm": 0.8133853077888489, "learning_rate": 0.0002, "epoch": 3.6083608360836084, "step": 4920}, {"loss": 0.8515, "grad_norm": 1.308401346206665, "learning_rate": 0.0002, "epoch": 3.615694902823616, "step": 4930}, {"loss": 0.8494, "grad_norm": 0.7131121754646301, "learning_rate": 0.0002, "epoch": 3.623028969563623, "step": 4940}, {"loss": 0.7235, "grad_norm": 0.6825910210609436, "learning_rate": 0.0002, "epoch": 3.6303630363036303, "step": 4950}, {"loss": 0.7824, "grad_norm": 0.7254678606987, "learning_rate": 0.0002, "epoch": 3.637697103043638, "step": 4960}, {"loss": 0.7983, "grad_norm": 0.8045085072517395, "learning_rate": 0.0002, "epoch": 3.6450311697836453, "step": 4970}, {"loss": 0.8223, "grad_norm": 0.6991777420043945, "learning_rate": 0.0002, "epoch": 3.6523652365236523, "step": 4980}, {"loss": 0.7806, "grad_norm": 0.7804713249206543, "learning_rate": 0.0002, "epoch": 3.6596993032636598, "step": 4990}, {"loss": 0.8402, "grad_norm": 0.8525708317756653, "learning_rate": 0.0002, "epoch": 3.667033370003667, "step": 5000}, {"loss": 0.8496, "grad_norm": 0.7959994673728943, "learning_rate": 0.0002, "epoch": 3.6743674367436743, "step": 5010}, {"loss": 0.8022, "grad_norm": 0.8103628158569336, "learning_rate": 0.0002, "epoch": 3.6817015034836817, "step": 5020}, {"loss": 0.7376, "grad_norm": 0.7517836093902588, "learning_rate": 0.0002, "epoch": 3.689035570223689, "step": 5030}, {"loss": 0.8375, "grad_norm": 0.6878514289855957, "learning_rate": 0.0002, "epoch": 3.6963696369636962, "step": 5040}, {"loss": 0.7998, "grad_norm": 1.2371820211410522, "learning_rate": 0.0002, "epoch": 3.7037037037037037, "step": 5050}, {"loss": 0.6941, "grad_norm": 0.6567103862762451, "learning_rate": 0.0002, "epoch": 3.711037770443711, "step": 5060}, {"loss": 0.8465, "grad_norm": 1.1254922151565552, "learning_rate": 0.0002, "epoch": 3.718371837183718, "step": 5070}, {"loss": 0.8365, "grad_norm": 0.6796132326126099, "learning_rate": 0.0002, "epoch": 3.7257059039237257, "step": 5080}, {"loss": 0.7818, "grad_norm": 0.7285300493240356, "learning_rate": 0.0002, "epoch": 3.733039970663733, "step": 5090}, {"loss": 0.8581, "grad_norm": 0.8931500911712646, "learning_rate": 0.0002, "epoch": 3.7403740374037406, "step": 5100}, {"loss": 0.8181, "grad_norm": 0.6256856918334961, "learning_rate": 0.0002, "epoch": 3.7477081041437477, "step": 5110}, {"loss": 0.743, "grad_norm": 0.79310142993927, "learning_rate": 0.0002, "epoch": 3.755042170883755, "step": 5120}, {"loss": 0.8235, "grad_norm": 0.6594041585922241, "learning_rate": 0.0002, "epoch": 3.762376237623762, "step": 5130}, {"loss": 0.6925, "grad_norm": 0.7029327750205994, "learning_rate": 0.0002, "epoch": 3.7697103043637696, "step": 5140}, {"loss": 0.7457, "grad_norm": 0.5880070328712463, "learning_rate": 0.0002, "epoch": 3.777044371103777, "step": 5150}, {"loss": 0.8716, "grad_norm": 0.7578945159912109, "learning_rate": 0.0002, "epoch": 3.7843784378437846, "step": 5160}, {"loss": 0.8819, "grad_norm": 0.8276378512382507, "learning_rate": 0.0002, "epoch": 3.7917125045837916, "step": 5170}, {"loss": 0.7559, "grad_norm": 0.7627953886985779, "learning_rate": 0.0002, "epoch": 3.799046571323799, "step": 5180}, {"loss": 0.7665, "grad_norm": 0.8169086575508118, "learning_rate": 0.0002, "epoch": 3.806380638063806, "step": 5190}, {"loss": 0.761, "grad_norm": 0.6605030298233032, "learning_rate": 0.0002, "epoch": 3.8137147048038136, "step": 5200}, {"loss": 0.8804, "grad_norm": 0.5837286114692688, "learning_rate": 0.0002, "epoch": 3.821048771543821, "step": 5210}, {"loss": 0.8369, "grad_norm": 1.2422157526016235, "learning_rate": 0.0002, "epoch": 3.8283828382838285, "step": 5220}, {"loss": 0.8431, "grad_norm": 0.6589220762252808, "learning_rate": 0.0002, "epoch": 3.8357169050238356, "step": 5230}, {"loss": 0.7686, "grad_norm": 0.8567556142807007, "learning_rate": 0.0002, "epoch": 3.843050971763843, "step": 5240}, {"loss": 0.8652, "grad_norm": 0.6490627527236938, "learning_rate": 0.0002, "epoch": 3.8503850385038505, "step": 5250}, {"loss": 0.7386, "grad_norm": 0.620232880115509, "learning_rate": 0.0002, "epoch": 3.8577191052438575, "step": 5260}, {"loss": 0.9192, "grad_norm": 0.7685128450393677, "learning_rate": 0.0002, "epoch": 3.865053171983865, "step": 5270}, {"loss": 0.872, "grad_norm": 0.8113296627998352, "learning_rate": 0.0002, "epoch": 3.8723872387238725, "step": 5280}, {"loss": 0.7156, "grad_norm": 0.8092675805091858, "learning_rate": 0.0002, "epoch": 3.87972130546388, "step": 5290}, {"loss": 0.7325, "grad_norm": 0.583570122718811, "learning_rate": 0.0002, "epoch": 3.887055372203887, "step": 5300}, {"loss": 0.9333, "grad_norm": 1.712363600730896, "learning_rate": 0.0002, "epoch": 3.8943894389438944, "step": 5310}, {"loss": 0.7537, "grad_norm": 0.6673534512519836, "learning_rate": 0.0002, "epoch": 3.9017235056839015, "step": 5320}, {"loss": 0.7035, "grad_norm": 1.9770312309265137, "learning_rate": 0.0002, "epoch": 3.909057572423909, "step": 5330}, {"loss": 0.8793, "grad_norm": 0.6430999636650085, "learning_rate": 0.0002, "epoch": 3.9163916391639164, "step": 5340}, {"loss": 0.839, "grad_norm": 1.0159571170806885, "learning_rate": 0.0002, "epoch": 3.923725705903924, "step": 5350}, {"loss": 0.9332, "grad_norm": 0.8607584834098816, "learning_rate": 0.0002, "epoch": 3.931059772643931, "step": 5360}, {"loss": 0.7261, "grad_norm": 0.6967900991439819, "learning_rate": 0.0002, "epoch": 3.9383938393839384, "step": 5370}, {"loss": 0.8456, "grad_norm": 0.7683077454566956, "learning_rate": 0.0002, "epoch": 3.945727906123946, "step": 5380}, {"loss": 0.7682, "grad_norm": 0.6805762648582458, "learning_rate": 0.0002, "epoch": 3.953061972863953, "step": 5390}, {"loss": 0.7746, "grad_norm": 0.7033619284629822, "learning_rate": 0.0002, "epoch": 3.9603960396039604, "step": 5400}, {"loss": 0.8393, "grad_norm": 0.966112494468689, "learning_rate": 0.0002, "epoch": 3.967730106343968, "step": 5410}, {"loss": 0.8316, "grad_norm": 0.8467881083488464, "learning_rate": 0.0002, "epoch": 3.9750641730839753, "step": 5420}, {"loss": 0.8084, "grad_norm": 0.8005317449569702, "learning_rate": 0.0002, "epoch": 3.9823982398239823, "step": 5430}, {"loss": 0.7168, "grad_norm": 1.1615241765975952, "learning_rate": 0.0002, "epoch": 3.98973230656399, "step": 5440}, {"loss": 0.8263, "grad_norm": 0.6121614575386047, "learning_rate": 0.0002, "epoch": 3.997066373303997, "step": 5450}, {"eval_loss": 1.1834222078323364, "eval_runtime": 32.7569, "eval_samples_per_second": 13.158, "eval_steps_per_second": 1.649, "epoch": 4.0, "step": 5454}, {"loss": 0.7267, "grad_norm": 0.6055727005004883, "learning_rate": 0.0002, "epoch": 4.004400440044004, "step": 5460}, {"loss": 0.5766, "grad_norm": 0.8232647180557251, "learning_rate": 0.0002, "epoch": 4.011734506784012, "step": 5470}, {"loss": 0.6489, "grad_norm": 0.7739192247390747, "learning_rate": 0.0002, "epoch": 4.019068573524019, "step": 5480}, {"loss": 0.5978, "grad_norm": 0.6264950633049011, "learning_rate": 0.0002, "epoch": 4.026402640264027, "step": 5490}, {"loss": 0.6392, "grad_norm": 1.4798702001571655, "learning_rate": 0.0002, "epoch": 4.033736707004033, "step": 5500}, {"loss": 0.6143, "grad_norm": 0.9538470506668091, "learning_rate": 0.0002, "epoch": 4.041070773744041, "step": 5510}, {"loss": 0.6056, "grad_norm": 0.834561288356781, "learning_rate": 0.0002, "epoch": 4.048404840484048, "step": 5520}, {"loss": 0.6077, "grad_norm": 0.6407850384712219, "learning_rate": 0.0002, "epoch": 4.055738907224056, "step": 5530}, {"loss": 0.6733, "grad_norm": 0.9035961627960205, "learning_rate": 0.0002, "epoch": 4.063072973964063, "step": 5540}, {"loss": 0.5854, "grad_norm": 0.842812716960907, "learning_rate": 0.0002, "epoch": 4.070407040704071, "step": 5550}, {"loss": 0.654, "grad_norm": 0.8197882175445557, "learning_rate": 0.0002, "epoch": 4.077741107444078, "step": 5560}, {"loss": 0.5919, "grad_norm": 0.8652673959732056, "learning_rate": 0.0002, "epoch": 4.085075174184085, "step": 5570}, {"loss": 0.6188, "grad_norm": 0.8048318028450012, "learning_rate": 0.0002, "epoch": 4.092409240924092, "step": 5580}, {"loss": 0.6487, "grad_norm": 0.9604969024658203, "learning_rate": 0.0002, "epoch": 4.0997433076641, "step": 5590}, {"loss": 0.6356, "grad_norm": 1.244756817817688, "learning_rate": 0.0002, "epoch": 4.107077374404107, "step": 5600}, {"loss": 0.6489, "grad_norm": 0.7975269556045532, "learning_rate": 0.0002, "epoch": 4.114411441144115, "step": 5610}, {"loss": 0.6445, "grad_norm": 0.6130099296569824, "learning_rate": 0.0002, "epoch": 4.121745507884122, "step": 5620}, {"loss": 0.6024, "grad_norm": 0.7793202996253967, "learning_rate": 0.0002, "epoch": 4.129079574624129, "step": 5630}, {"loss": 0.5723, "grad_norm": 1.187238335609436, "learning_rate": 0.0002, "epoch": 4.136413641364136, "step": 5640}, {"loss": 0.6385, "grad_norm": 0.8450375199317932, "learning_rate": 0.0002, "epoch": 4.143747708104144, "step": 5650}, {"loss": 0.6866, "grad_norm": 0.9006940126419067, "learning_rate": 0.0002, "epoch": 4.151081774844151, "step": 5660}, {"loss": 0.6179, "grad_norm": 0.9447154998779297, "learning_rate": 0.0002, "epoch": 4.158415841584159, "step": 5670}, {"loss": 0.6476, "grad_norm": 0.798032283782959, "learning_rate": 0.0002, "epoch": 4.165749908324166, "step": 5680}, {"loss": 0.6666, "grad_norm": 0.65578693151474, "learning_rate": 0.0002, "epoch": 4.1730839750641735, "step": 5690}, {"loss": 0.701, "grad_norm": 1.0864700078964233, "learning_rate": 0.0002, "epoch": 4.18041804180418, "step": 5700}, {"loss": 0.6895, "grad_norm": 0.7344121932983398, "learning_rate": 0.0002, "epoch": 4.187752108544188, "step": 5710}, {"loss": 0.6659, "grad_norm": 0.9722456932067871, "learning_rate": 0.0002, "epoch": 4.195086175284195, "step": 5720}, {"loss": 0.6887, "grad_norm": 1.263814926147461, "learning_rate": 0.0002, "epoch": 4.2024202420242025, "step": 5730}, {"loss": 0.608, "grad_norm": 0.9622581005096436, "learning_rate": 0.0002, "epoch": 4.20975430876421, "step": 5740}, {"loss": 0.6221, "grad_norm": 0.8497143387794495, "learning_rate": 0.0002, "epoch": 4.2170883755042174, "step": 5750}, {"loss": 0.6322, "grad_norm": 0.8248446583747864, "learning_rate": 0.0002, "epoch": 4.224422442244224, "step": 5760}, {"loss": 0.6045, "grad_norm": 1.2544798851013184, "learning_rate": 0.0002, "epoch": 4.2317565089842315, "step": 5770}, {"loss": 0.641, "grad_norm": 0.8224676251411438, "learning_rate": 0.0002, "epoch": 4.239090575724239, "step": 5780}, {"loss": 0.6399, "grad_norm": 0.8924877047538757, "learning_rate": 0.0002, "epoch": 4.2464246424642464, "step": 5790}, {"loss": 0.6845, "grad_norm": 0.8545848727226257, "learning_rate": 0.0002, "epoch": 4.253758709204254, "step": 5800}, {"loss": 0.6669, "grad_norm": 0.8081067800521851, "learning_rate": 0.0002, "epoch": 4.261092775944261, "step": 5810}, {"loss": 0.6149, "grad_norm": 0.7111002802848816, "learning_rate": 0.0002, "epoch": 4.268426842684269, "step": 5820}, {"loss": 0.6343, "grad_norm": 0.8696979880332947, "learning_rate": 0.0002, "epoch": 4.2757609094242754, "step": 5830}, {"loss": 0.6384, "grad_norm": 0.821401834487915, "learning_rate": 0.0002, "epoch": 4.283094976164283, "step": 5840}, {"loss": 0.6912, "grad_norm": 0.888908326625824, "learning_rate": 0.0002, "epoch": 4.29042904290429, "step": 5850}, {"loss": 0.6061, "grad_norm": 1.9380123615264893, "learning_rate": 0.0002, "epoch": 4.297763109644298, "step": 5860}, {"loss": 0.6766, "grad_norm": 1.121774435043335, "learning_rate": 0.0002, "epoch": 4.305097176384305, "step": 5870}, {"loss": 0.7205, "grad_norm": 0.9238282442092896, "learning_rate": 0.0002, "epoch": 4.312431243124313, "step": 5880}, {"loss": 0.6351, "grad_norm": 0.7321620583534241, "learning_rate": 0.0002, "epoch": 4.319765309864319, "step": 5890}, {"loss": 0.6404, "grad_norm": 0.8739548325538635, "learning_rate": 0.0002, "epoch": 4.327099376604327, "step": 5900}, {"loss": 0.5892, "grad_norm": 0.9686012268066406, "learning_rate": 0.0002, "epoch": 4.334433443344334, "step": 5910}, {"loss": 0.641, "grad_norm": 0.9033839106559753, "learning_rate": 0.0002, "epoch": 4.341767510084342, "step": 5920}, {"loss": 0.6456, "grad_norm": 0.8131115436553955, "learning_rate": 0.0002, "epoch": 4.349101576824349, "step": 5930}, {"loss": 0.5826, "grad_norm": 0.8942412734031677, "learning_rate": 0.0002, "epoch": 4.356435643564357, "step": 5940}, {"loss": 0.7336, "grad_norm": 0.8439112901687622, "learning_rate": 0.0002, "epoch": 4.363769710304364, "step": 5950}, {"loss": 0.6537, "grad_norm": 0.9176713228225708, "learning_rate": 0.0002, "epoch": 4.371103777044371, "step": 5960}, {"loss": 0.6792, "grad_norm": 0.6799634695053101, "learning_rate": 0.0002, "epoch": 4.378437843784378, "step": 5970}, {"loss": 0.7266, "grad_norm": 1.0435824394226074, "learning_rate": 0.0002, "epoch": 4.385771910524386, "step": 5980}, {"loss": 0.68, "grad_norm": 0.997937798500061, "learning_rate": 0.0002, "epoch": 4.393105977264393, "step": 5990}, {"loss": 0.6604, "grad_norm": 1.0308842658996582, "learning_rate": 0.0002, "epoch": 4.400440044004401, "step": 6000}, {"loss": 0.6402, "grad_norm": 1.3683775663375854, "learning_rate": 0.0002, "epoch": 4.407774110744408, "step": 6010}, {"loss": 0.7027, "grad_norm": 0.7569534182548523, "learning_rate": 0.0002, "epoch": 4.415108177484415, "step": 6020}, {"loss": 0.5949, "grad_norm": 1.089978575706482, "learning_rate": 0.0002, "epoch": 4.422442244224422, "step": 6030}, {"loss": 0.6353, "grad_norm": 0.7522459626197815, "learning_rate": 0.0002, "epoch": 4.42977631096443, "step": 6040}, {"loss": 0.5852, "grad_norm": 0.6709823608398438, "learning_rate": 0.0002, "epoch": 4.437110377704437, "step": 6050}, {"loss": 0.6718, "grad_norm": 0.6992089748382568, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 6060}, {"loss": 0.6933, "grad_norm": 1.0182931423187256, "learning_rate": 0.0002, "epoch": 4.451778511184452, "step": 6070}, {"loss": 0.6255, "grad_norm": 1.0685160160064697, "learning_rate": 0.0002, "epoch": 4.459112577924459, "step": 6080}, {"loss": 0.6086, "grad_norm": 0.8295124769210815, "learning_rate": 0.0002, "epoch": 4.466446644664466, "step": 6090}, {"loss": 0.6359, "grad_norm": 1.1862998008728027, "learning_rate": 0.0002, "epoch": 4.473780711404474, "step": 6100}, {"loss": 0.638, "grad_norm": 0.7400273084640503, "learning_rate": 0.0002, "epoch": 4.481114778144481, "step": 6110}, {"loss": 0.6854, "grad_norm": 0.7098417282104492, "learning_rate": 0.0002, "epoch": 4.488448844884489, "step": 6120}, {"loss": 0.6976, "grad_norm": 0.9745053648948669, "learning_rate": 0.0002, "epoch": 4.495782911624496, "step": 6130}, {"loss": 0.605, "grad_norm": 0.8638797998428345, "learning_rate": 0.0002, "epoch": 4.503116978364503, "step": 6140}, {"loss": 0.6491, "grad_norm": 0.8291046619415283, "learning_rate": 0.0002, "epoch": 4.51045104510451, "step": 6150}, {"loss": 0.6457, "grad_norm": 1.0301737785339355, "learning_rate": 0.0002, "epoch": 4.517785111844518, "step": 6160}, {"loss": 0.6742, "grad_norm": 1.1996512413024902, "learning_rate": 0.0002, "epoch": 4.525119178584525, "step": 6170}, {"loss": 0.6484, "grad_norm": 1.151038408279419, "learning_rate": 0.0002, "epoch": 4.5324532453245325, "step": 6180}, {"loss": 0.668, "grad_norm": 0.8385201096534729, "learning_rate": 0.0002, "epoch": 4.53978731206454, "step": 6190}, {"loss": 0.6381, "grad_norm": 0.8969188332557678, "learning_rate": 0.0002, "epoch": 4.5471213788045475, "step": 6200}, {"loss": 0.7141, "grad_norm": 1.60659658908844, "learning_rate": 0.0002, "epoch": 4.554455445544555, "step": 6210}, {"loss": 0.6388, "grad_norm": 0.9356731176376343, "learning_rate": 0.0002, "epoch": 4.5617895122845615, "step": 6220}, {"loss": 0.7393, "grad_norm": 0.95856773853302, "learning_rate": 0.0002, "epoch": 4.569123579024569, "step": 6230}, {"loss": 0.6554, "grad_norm": 1.1162524223327637, "learning_rate": 0.0002, "epoch": 4.5764576457645765, "step": 6240}, {"loss": 0.6012, "grad_norm": 0.8809238076210022, "learning_rate": 0.0002, "epoch": 4.583791712504584, "step": 6250}, {"loss": 0.648, "grad_norm": 0.890738844871521, "learning_rate": 0.0002, "epoch": 4.591125779244591, "step": 6260}, {"loss": 0.6663, "grad_norm": 0.918684720993042, "learning_rate": 0.0002, "epoch": 4.598459845984598, "step": 6270}, {"loss": 0.5992, "grad_norm": 0.8156296610832214, "learning_rate": 0.0002, "epoch": 4.6057939127246055, "step": 6280}, {"loss": 0.723, "grad_norm": 1.046634316444397, "learning_rate": 0.0002, "epoch": 4.613127979464613, "step": 6290}, {"loss": 0.7023, "grad_norm": 0.7725525498390198, "learning_rate": 0.0002, "epoch": 4.62046204620462, "step": 6300}, {"loss": 0.6414, "grad_norm": 0.9992046356201172, "learning_rate": 0.0002, "epoch": 4.627796112944628, "step": 6310}, {"loss": 0.6201, "grad_norm": 0.8480095267295837, "learning_rate": 0.0002, "epoch": 4.635130179684635, "step": 6320}, {"loss": 0.6869, "grad_norm": 0.7061955332756042, "learning_rate": 0.0002, "epoch": 4.642464246424643, "step": 6330}, {"loss": 0.6828, "grad_norm": 1.0354212522506714, "learning_rate": 0.0002, "epoch": 4.649798313164649, "step": 6340}, {"loss": 0.6651, "grad_norm": 1.0081377029418945, "learning_rate": 0.0002, "epoch": 4.657132379904657, "step": 6350}, {"loss": 0.726, "grad_norm": 1.2904249429702759, "learning_rate": 0.0002, "epoch": 4.664466446644664, "step": 6360}, {"loss": 0.7148, "grad_norm": 0.9248910546302795, "learning_rate": 0.0002, "epoch": 4.671800513384672, "step": 6370}, {"loss": 0.6961, "grad_norm": 0.9907804131507874, "learning_rate": 0.0002, "epoch": 4.679134580124679, "step": 6380}, {"loss": 0.6163, "grad_norm": 1.201143741607666, "learning_rate": 0.0002, "epoch": 4.686468646864687, "step": 6390}, {"loss": 0.6762, "grad_norm": 0.8709394335746765, "learning_rate": 0.0002, "epoch": 4.693802713604693, "step": 6400}, {"loss": 0.7217, "grad_norm": 0.7468608021736145, "learning_rate": 0.0002, "epoch": 4.701136780344701, "step": 6410}, {"loss": 0.6548, "grad_norm": 0.8607903718948364, "learning_rate": 0.0002, "epoch": 4.708470847084708, "step": 6420}, {"loss": 0.6449, "grad_norm": 0.9840512871742249, "learning_rate": 0.0002, "epoch": 4.715804913824716, "step": 6430}, {"loss": 0.685, "grad_norm": 0.8328204154968262, "learning_rate": 0.0002, "epoch": 4.723138980564723, "step": 6440}, {"loss": 0.697, "grad_norm": 0.924505352973938, "learning_rate": 0.0002, "epoch": 4.730473047304731, "step": 6450}, {"loss": 0.7422, "grad_norm": 0.8897685408592224, "learning_rate": 0.0002, "epoch": 4.737807114044738, "step": 6460}, {"loss": 0.6842, "grad_norm": 0.9605024456977844, "learning_rate": 0.0002, "epoch": 4.745141180784745, "step": 6470}, {"loss": 0.6488, "grad_norm": 0.8150759935379028, "learning_rate": 0.0002, "epoch": 4.752475247524752, "step": 6480}, {"loss": 0.6606, "grad_norm": 0.8128412961959839, "learning_rate": 0.0002, "epoch": 4.75980931426476, "step": 6490}, {"loss": 0.6729, "grad_norm": 0.7381404638290405, "learning_rate": 0.0002, "epoch": 4.767143381004767, "step": 6500}, {"loss": 0.6713, "grad_norm": 1.0565853118896484, "learning_rate": 0.0002, "epoch": 4.774477447744775, "step": 6510}, {"loss": 0.6496, "grad_norm": 0.9298134446144104, "learning_rate": 0.0002, "epoch": 4.781811514484782, "step": 6520}, {"loss": 0.7279, "grad_norm": 1.0145525932312012, "learning_rate": 0.0002, "epoch": 4.789145581224789, "step": 6530}, {"loss": 0.5986, "grad_norm": 0.92259681224823, "learning_rate": 0.0002, "epoch": 4.796479647964796, "step": 6540}, {"loss": 0.63, "grad_norm": 0.7881024479866028, "learning_rate": 0.0002, "epoch": 4.803813714704804, "step": 6550}, {"loss": 0.7134, "grad_norm": 1.4935206174850464, "learning_rate": 0.0002, "epoch": 4.811147781444811, "step": 6560}, {"loss": 0.6695, "grad_norm": 0.8612369298934937, "learning_rate": 0.0002, "epoch": 4.818481848184819, "step": 6570}, {"loss": 0.779, "grad_norm": 1.0118653774261475, "learning_rate": 0.0002, "epoch": 4.825815914924826, "step": 6580}, {"loss": 0.6991, "grad_norm": 1.1303809881210327, "learning_rate": 0.0002, "epoch": 4.833149981664834, "step": 6590}, {"loss": 0.7887, "grad_norm": 0.9112492203712463, "learning_rate": 0.0002, "epoch": 4.84048404840484, "step": 6600}, {"loss": 0.7699, "grad_norm": 0.864762544631958, "learning_rate": 0.0002, "epoch": 4.847818115144848, "step": 6610}, {"loss": 0.7347, "grad_norm": 0.9090572595596313, "learning_rate": 0.0002, "epoch": 4.855152181884855, "step": 6620}, {"loss": 0.6608, "grad_norm": 1.014953374862671, "learning_rate": 0.0002, "epoch": 4.862486248624863, "step": 6630}, {"loss": 0.6429, "grad_norm": 1.0702149868011475, "learning_rate": 0.0002, "epoch": 4.86982031536487, "step": 6640}, {"loss": 0.6943, "grad_norm": 1.002135157585144, "learning_rate": 0.0002, "epoch": 4.8771543821048775, "step": 6650}, {"loss": 0.7225, "grad_norm": 0.862545907497406, "learning_rate": 0.0002, "epoch": 4.884488448844884, "step": 6660}, {"loss": 0.6206, "grad_norm": 0.7302131056785583, "learning_rate": 0.0002, "epoch": 4.891822515584892, "step": 6670}, {"loss": 0.7175, "grad_norm": 0.8380730152130127, "learning_rate": 0.0002, "epoch": 4.899156582324899, "step": 6680}, {"loss": 0.645, "grad_norm": 0.7956018447875977, "learning_rate": 0.0002, "epoch": 4.9064906490649065, "step": 6690}, {"loss": 0.6431, "grad_norm": 0.6717583537101746, "learning_rate": 0.0002, "epoch": 4.913824715804914, "step": 6700}, {"loss": 0.6942, "grad_norm": 1.09099280834198, "learning_rate": 0.0002, "epoch": 4.9211587825449215, "step": 6710}, {"loss": 0.7533, "grad_norm": 0.8589889407157898, "learning_rate": 0.0002, "epoch": 4.928492849284929, "step": 6720}, {"loss": 0.66, "grad_norm": 1.0046314001083374, "learning_rate": 0.0002, "epoch": 4.9358269160249355, "step": 6730}, {"loss": 0.6864, "grad_norm": 0.8559659123420715, "learning_rate": 0.0002, "epoch": 4.943160982764943, "step": 6740}, {"loss": 0.6847, "grad_norm": 0.8588525652885437, "learning_rate": 0.0002, "epoch": 4.9504950495049505, "step": 6750}, {"loss": 0.6428, "grad_norm": 0.9192708134651184, "learning_rate": 0.0002, "epoch": 4.957829116244958, "step": 6760}, {"loss": 0.6873, "grad_norm": 1.051398754119873, "learning_rate": 0.0002, "epoch": 4.965163182984965, "step": 6770}, {"loss": 0.7249, "grad_norm": 0.9111362099647522, "learning_rate": 0.0002, "epoch": 4.972497249724973, "step": 6780}, {"loss": 0.7613, "grad_norm": 0.7305638194084167, "learning_rate": 0.0002, "epoch": 4.9798313164649795, "step": 6790}, {"loss": 0.6747, "grad_norm": 1.118837594985962, "learning_rate": 0.0002, "epoch": 4.987165383204987, "step": 6800}, {"loss": 0.6412, "grad_norm": 0.9075239300727844, "learning_rate": 0.0002, "epoch": 4.994499449944994, "step": 6810}, {"eval_loss": 1.2361247539520264, "eval_runtime": 32.7325, "eval_samples_per_second": 13.167, "eval_steps_per_second": 1.65, "epoch": 4.999633296662999, "step": 6817}, {"loss": 0.7091, "grad_norm": 1.0541315078735352, "learning_rate": 0.0002, "epoch": 5.001833516685002, "step": 6820}, {"loss": 0.4882, "grad_norm": 0.9750140905380249, "learning_rate": 0.0002, "epoch": 5.009167583425009, "step": 6830}, {"loss": 0.6022, "grad_norm": 0.931838870048523, "learning_rate": 0.0002, "epoch": 5.016501650165017, "step": 6840}, {"loss": 0.5194, "grad_norm": 1.110278844833374, "learning_rate": 0.0002, "epoch": 5.023835716905023, "step": 6850}, {"loss": 0.4676, "grad_norm": 1.0670180320739746, "learning_rate": 0.0002, "epoch": 5.031169783645031, "step": 6860}, {"loss": 0.4374, "grad_norm": 0.8762092590332031, "learning_rate": 0.0002, "epoch": 5.038503850385038, "step": 6870}, {"loss": 0.505, "grad_norm": 1.1169432401657104, "learning_rate": 0.0002, "epoch": 5.045837917125046, "step": 6880}, {"loss": 0.5114, "grad_norm": 1.005491018295288, "learning_rate": 0.0002, "epoch": 5.053171983865053, "step": 6890}, {"loss": 0.5221, "grad_norm": 1.1751841306686401, "learning_rate": 0.0002, "epoch": 5.060506050605061, "step": 6900}, {"loss": 0.451, "grad_norm": 0.8501367568969727, "learning_rate": 0.0002, "epoch": 5.067840117345068, "step": 6910}, {"loss": 0.5292, "grad_norm": 0.9795131683349609, "learning_rate": 0.0002, "epoch": 5.075174184085075, "step": 6920}, {"loss": 0.5234, "grad_norm": 0.8929879665374756, "learning_rate": 0.0002, "epoch": 5.082508250825082, "step": 6930}, {"loss": 0.5378, "grad_norm": 1.0156651735305786, "learning_rate": 0.0002, "epoch": 5.08984231756509, "step": 6940}, {"loss": 0.5241, "grad_norm": 1.0974335670471191, "learning_rate": 0.0002, "epoch": 5.097176384305097, "step": 6950}, {"loss": 0.5705, "grad_norm": 1.7015666961669922, "learning_rate": 0.0002, "epoch": 5.104510451045105, "step": 6960}, {"loss": 0.523, "grad_norm": 1.0343226194381714, "learning_rate": 0.0002, "epoch": 5.111844517785112, "step": 6970}, {"loss": 0.4616, "grad_norm": 1.3072983026504517, "learning_rate": 0.0002, "epoch": 5.119178584525119, "step": 6980}, {"loss": 0.4813, "grad_norm": 1.038986086845398, "learning_rate": 0.0002, "epoch": 5.126512651265126, "step": 6990}, {"loss": 0.4616, "grad_norm": 0.8638386130332947, "learning_rate": 0.0002, "epoch": 5.133846718005134, "step": 7000}, {"loss": 0.5294, "grad_norm": 0.8326523900032043, "learning_rate": 0.0002, "epoch": 5.141180784745141, "step": 7010}, {"loss": 0.5021, "grad_norm": 1.0976895093917847, "learning_rate": 0.0002, "epoch": 5.148514851485149, "step": 7020}, {"loss": 0.4677, "grad_norm": 1.0077873468399048, "learning_rate": 0.0002, "epoch": 5.155848918225156, "step": 7030}, {"loss": 0.5262, "grad_norm": 1.0662257671356201, "learning_rate": 0.0002, "epoch": 5.163182984965164, "step": 7040}, {"loss": 0.5484, "grad_norm": 1.206271767616272, "learning_rate": 0.0002, "epoch": 5.17051705170517, "step": 7050}, {"loss": 0.4817, "grad_norm": 1.1990262269973755, "learning_rate": 0.0002, "epoch": 5.177851118445178, "step": 7060}, {"loss": 0.6048, "grad_norm": 1.0207163095474243, "learning_rate": 0.0002, "epoch": 5.185185185185185, "step": 7070}, {"loss": 0.4816, "grad_norm": 1.2783987522125244, "learning_rate": 0.0002, "epoch": 5.192519251925193, "step": 7080}, {"loss": 0.5322, "grad_norm": 1.1592512130737305, "learning_rate": 0.0002, "epoch": 5.1998533186652, "step": 7090}, {"loss": 0.5472, "grad_norm": 1.1053160429000854, "learning_rate": 0.0002, "epoch": 5.2071873854052075, "step": 7100}, {"loss": 0.4986, "grad_norm": 1.1925510168075562, "learning_rate": 0.0002, "epoch": 5.214521452145214, "step": 7110}, {"loss": 0.5065, "grad_norm": 1.0714877843856812, "learning_rate": 0.0002, "epoch": 5.221855518885222, "step": 7120}, {"loss": 0.5209, "grad_norm": 0.9451011419296265, "learning_rate": 0.0002, "epoch": 5.229189585625229, "step": 7130}, {"loss": 0.5298, "grad_norm": 1.03838050365448, "learning_rate": 0.0002, "epoch": 5.2365236523652365, "step": 7140}, {"loss": 0.4848, "grad_norm": 0.9204146265983582, "learning_rate": 0.0002, "epoch": 5.243857719105244, "step": 7150}, {"loss": 0.5164, "grad_norm": 1.0142229795455933, "learning_rate": 0.0002, "epoch": 5.2511917858452515, "step": 7160}, {"loss": 0.5092, "grad_norm": 1.4432005882263184, "learning_rate": 0.0002, "epoch": 5.258525852585258, "step": 7170}, {"loss": 0.5133, "grad_norm": 1.1239633560180664, "learning_rate": 0.0002, "epoch": 5.2658599193252655, "step": 7180}, {"loss": 0.4969, "grad_norm": 0.7012821435928345, "learning_rate": 0.0002, "epoch": 5.273193986065273, "step": 7190}, {"loss": 0.5466, "grad_norm": 1.3499128818511963, "learning_rate": 0.0002, "epoch": 5.2805280528052805, "step": 7200}, {"loss": 0.5282, "grad_norm": 0.9498730897903442, "learning_rate": 0.0002, "epoch": 5.287862119545288, "step": 7210}, {"loss": 0.5051, "grad_norm": 0.9552369117736816, "learning_rate": 0.0002, "epoch": 5.295196186285295, "step": 7220}, {"loss": 0.5329, "grad_norm": 0.7610348463058472, "learning_rate": 0.0002, "epoch": 5.302530253025303, "step": 7230}, {"loss": 0.468, "grad_norm": 1.0314512252807617, "learning_rate": 0.0002, "epoch": 5.3098643197653095, "step": 7240}, {"loss": 0.5367, "grad_norm": 1.0534334182739258, "learning_rate": 0.0002, "epoch": 5.317198386505317, "step": 7250}, {"loss": 0.5491, "grad_norm": 1.2553406953811646, "learning_rate": 0.0002, "epoch": 5.324532453245324, "step": 7260}, {"loss": 0.5218, "grad_norm": 0.7061691880226135, "learning_rate": 0.0002, "epoch": 5.331866519985332, "step": 7270}, {"loss": 0.5625, "grad_norm": 0.9652578830718994, "learning_rate": 0.0002, "epoch": 5.339200586725339, "step": 7280}, {"loss": 0.5608, "grad_norm": 1.114788293838501, "learning_rate": 0.0002, "epoch": 5.346534653465347, "step": 7290}, {"loss": 0.578, "grad_norm": 1.0940049886703491, "learning_rate": 0.0002, "epoch": 5.353868720205353, "step": 7300}, {"loss": 0.5256, "grad_norm": 1.0151008367538452, "learning_rate": 0.0002, "epoch": 5.361202786945361, "step": 7310}, {"loss": 0.5377, "grad_norm": 1.0369552373886108, "learning_rate": 0.0002, "epoch": 5.368536853685368, "step": 7320}, {"loss": 0.5028, "grad_norm": 0.8489866256713867, "learning_rate": 0.0002, "epoch": 5.375870920425376, "step": 7330}, {"loss": 0.5937, "grad_norm": 1.1031713485717773, "learning_rate": 0.0002, "epoch": 5.383204987165383, "step": 7340}, {"loss": 0.5355, "grad_norm": 0.9094716310501099, "learning_rate": 0.0002, "epoch": 5.390539053905391, "step": 7350}, {"loss": 0.5406, "grad_norm": 0.9530431032180786, "learning_rate": 0.0002, "epoch": 5.397873120645398, "step": 7360}, {"loss": 0.529, "grad_norm": 0.9633604884147644, "learning_rate": 0.0002, "epoch": 5.405207187385405, "step": 7370}, {"loss": 0.5315, "grad_norm": 0.9541662335395813, "learning_rate": 0.0002, "epoch": 5.412541254125412, "step": 7380}, {"loss": 0.6774, "grad_norm": 1.0459771156311035, "learning_rate": 0.0002, "epoch": 5.41987532086542, "step": 7390}, {"loss": 0.5737, "grad_norm": 1.027388334274292, "learning_rate": 0.0002, "epoch": 5.427209387605427, "step": 7400}, {"loss": 0.556, "grad_norm": 0.7267653346061707, "learning_rate": 0.0002, "epoch": 5.434543454345435, "step": 7410}, {"loss": 0.4581, "grad_norm": 1.020142674446106, "learning_rate": 0.0002, "epoch": 5.441877521085442, "step": 7420}, {"loss": 0.4853, "grad_norm": 1.044754147529602, "learning_rate": 0.0002, "epoch": 5.449211587825449, "step": 7430}, {"loss": 0.5666, "grad_norm": 1.5476195812225342, "learning_rate": 0.0002, "epoch": 5.456545654565456, "step": 7440}, {"loss": 0.5302, "grad_norm": 0.9879506826400757, "learning_rate": 0.0002, "epoch": 5.463879721305464, "step": 7450}, {"loss": 0.591, "grad_norm": 1.2562980651855469, "learning_rate": 0.0002, "epoch": 5.471213788045471, "step": 7460}, {"loss": 0.5188, "grad_norm": 1.3051384687423706, "learning_rate": 0.0002, "epoch": 5.478547854785479, "step": 7470}, {"loss": 0.5658, "grad_norm": 1.0511597394943237, "learning_rate": 0.0002, "epoch": 5.485881921525486, "step": 7480}, {"loss": 0.6327, "grad_norm": 1.0380817651748657, "learning_rate": 0.0002, "epoch": 5.493215988265494, "step": 7490}, {"loss": 0.5356, "grad_norm": 1.170274257659912, "learning_rate": 0.0002, "epoch": 5.5005500550055, "step": 7500}, {"loss": 0.5405, "grad_norm": 1.3356517553329468, "learning_rate": 0.0002, "epoch": 5.507884121745508, "step": 7510}, {"loss": 0.5305, "grad_norm": 1.0727124214172363, "learning_rate": 0.0002, "epoch": 5.515218188485515, "step": 7520}, {"loss": 0.5543, "grad_norm": 1.0110199451446533, "learning_rate": 0.0002, "epoch": 5.522552255225523, "step": 7530}, {"loss": 0.5962, "grad_norm": 1.3086743354797363, "learning_rate": 0.0002, "epoch": 5.52988632196553, "step": 7540}, {"loss": 0.5512, "grad_norm": 1.1904916763305664, "learning_rate": 0.0002, "epoch": 5.537220388705538, "step": 7550}, {"loss": 0.5915, "grad_norm": 0.9466280937194824, "learning_rate": 0.0002, "epoch": 5.544554455445544, "step": 7560}, {"loss": 0.5573, "grad_norm": 1.1237901449203491, "learning_rate": 0.0002, "epoch": 5.551888522185552, "step": 7570}, {"loss": 0.5383, "grad_norm": 0.9590660333633423, "learning_rate": 0.0002, "epoch": 5.559222588925559, "step": 7580}, {"loss": 0.5594, "grad_norm": 1.0890778303146362, "learning_rate": 0.0002, "epoch": 5.566556655665567, "step": 7590}, {"loss": 0.5698, "grad_norm": 0.7206931114196777, "learning_rate": 0.0002, "epoch": 5.573890722405574, "step": 7600}, {"loss": 0.5511, "grad_norm": 1.2884514331817627, "learning_rate": 0.0002, "epoch": 5.5812247891455815, "step": 7610}, {"loss": 0.5279, "grad_norm": 0.7798039317131042, "learning_rate": 0.0002, "epoch": 5.588558855885589, "step": 7620}, {"loss": 0.4847, "grad_norm": 1.166046142578125, "learning_rate": 0.0002, "epoch": 5.595892922625596, "step": 7630}, {"loss": 0.5821, "grad_norm": 1.0150201320648193, "learning_rate": 0.0002, "epoch": 5.603226989365603, "step": 7640}, {"loss": 0.5296, "grad_norm": 1.0449682474136353, "learning_rate": 0.0002, "epoch": 5.6105610561056105, "step": 7650}, {"loss": 0.5431, "grad_norm": 0.9310530424118042, "learning_rate": 0.0002, "epoch": 5.617895122845618, "step": 7660}, {"loss": 0.5234, "grad_norm": 0.9117933511734009, "learning_rate": 0.0002, "epoch": 5.6252291895856255, "step": 7670}, {"loss": 0.5807, "grad_norm": 1.1475164890289307, "learning_rate": 0.0002, "epoch": 5.632563256325633, "step": 7680}, {"loss": 0.5816, "grad_norm": 1.066809058189392, "learning_rate": 0.0002, "epoch": 5.6398973230656395, "step": 7690}, {"loss": 0.551, "grad_norm": 1.2834991216659546, "learning_rate": 0.0002, "epoch": 5.647231389805647, "step": 7700}, {"loss": 0.5914, "grad_norm": 1.2245112657546997, "learning_rate": 0.0002, "epoch": 5.6545654565456545, "step": 7710}, {"loss": 0.5552, "grad_norm": 1.1424106359481812, "learning_rate": 0.0002, "epoch": 5.661899523285662, "step": 7720}, {"loss": 0.559, "grad_norm": 1.0673892498016357, "learning_rate": 0.0002, "epoch": 5.669233590025669, "step": 7730}, {"loss": 0.544, "grad_norm": 1.4312121868133545, "learning_rate": 0.0002, "epoch": 5.676567656765677, "step": 7740}, {"loss": 0.5576, "grad_norm": 0.9976982474327087, "learning_rate": 0.0002, "epoch": 5.683901723505684, "step": 7750}, {"loss": 0.4855, "grad_norm": 0.9464678168296814, "learning_rate": 0.0002, "epoch": 5.691235790245691, "step": 7760}, {"loss": 0.5363, "grad_norm": 1.010995626449585, "learning_rate": 0.0002, "epoch": 5.698569856985698, "step": 7770}, {"loss": 0.5873, "grad_norm": 1.3787750005722046, "learning_rate": 0.0002, "epoch": 5.705903923725706, "step": 7780}, {"loss": 0.6234, "grad_norm": 1.020922303199768, "learning_rate": 0.0002, "epoch": 5.713237990465713, "step": 7790}, {"loss": 0.5337, "grad_norm": 0.9748636484146118, "learning_rate": 0.0002, "epoch": 5.720572057205721, "step": 7800}, {"loss": 0.5507, "grad_norm": 1.3077744245529175, "learning_rate": 0.0002, "epoch": 5.727906123945728, "step": 7810}, {"loss": 0.558, "grad_norm": 1.4770057201385498, "learning_rate": 0.0002, "epoch": 5.735240190685735, "step": 7820}, {"loss": 0.5571, "grad_norm": 1.6349090337753296, "learning_rate": 0.0002, "epoch": 5.742574257425742, "step": 7830}, {"loss": 0.5056, "grad_norm": 0.9818630814552307, "learning_rate": 0.0002, "epoch": 5.74990832416575, "step": 7840}, {"loss": 0.5495, "grad_norm": 0.9659715890884399, "learning_rate": 0.0002, "epoch": 5.757242390905757, "step": 7850}, {"loss": 0.5628, "grad_norm": 0.9269950985908508, "learning_rate": 0.0002, "epoch": 5.764576457645765, "step": 7860}, {"loss": 0.5594, "grad_norm": 1.0099073648452759, "learning_rate": 0.0002, "epoch": 5.771910524385772, "step": 7870}, {"loss": 0.5912, "grad_norm": 0.9123615026473999, "learning_rate": 0.0002, "epoch": 5.77924459112578, "step": 7880}, {"loss": 0.6054, "grad_norm": 1.1542246341705322, "learning_rate": 0.0002, "epoch": 5.786578657865786, "step": 7890}, {"loss": 0.5829, "grad_norm": 1.0792022943496704, "learning_rate": 0.0002, "epoch": 5.793912724605794, "step": 7900}, {"loss": 0.504, "grad_norm": 0.95615553855896, "learning_rate": 0.0002, "epoch": 5.801246791345801, "step": 7910}, {"loss": 0.5918, "grad_norm": 1.2471332550048828, "learning_rate": 0.0002, "epoch": 5.808580858085809, "step": 7920}, {"loss": 0.5719, "grad_norm": 1.0189851522445679, "learning_rate": 0.0002, "epoch": 5.815914924825816, "step": 7930}, {"loss": 0.5958, "grad_norm": 1.3309742212295532, "learning_rate": 0.0002, "epoch": 5.823248991565823, "step": 7940}, {"loss": 0.6255, "grad_norm": 1.2930549383163452, "learning_rate": 0.0002, "epoch": 5.83058305830583, "step": 7950}, {"loss": 0.5301, "grad_norm": 0.8216308951377869, "learning_rate": 0.0002, "epoch": 5.837917125045838, "step": 7960}, {"loss": 0.5397, "grad_norm": 1.1205775737762451, "learning_rate": 0.0002, "epoch": 5.845251191785845, "step": 7970}, {"loss": 0.5903, "grad_norm": 0.851298451423645, "learning_rate": 0.0002, "epoch": 5.852585258525853, "step": 7980}, {"loss": 0.5981, "grad_norm": 0.8797095417976379, "learning_rate": 0.0002, "epoch": 5.85991932526586, "step": 7990}, {"loss": 0.6106, "grad_norm": 1.5784614086151123, "learning_rate": 0.0002, "epoch": 5.867253392005868, "step": 8000}, {"loss": 0.5956, "grad_norm": 1.1531187295913696, "learning_rate": 0.0002, "epoch": 5.874587458745875, "step": 8010}, {"loss": 0.6289, "grad_norm": 1.2469146251678467, "learning_rate": 0.0002, "epoch": 5.881921525485882, "step": 8020}, {"loss": 0.5827, "grad_norm": 1.0784350633621216, "learning_rate": 0.0002, "epoch": 5.889255592225889, "step": 8030}, {"loss": 0.6339, "grad_norm": 1.1311599016189575, "learning_rate": 0.0002, "epoch": 5.896589658965897, "step": 8040}, {"loss": 0.5815, "grad_norm": 0.9654512405395508, "learning_rate": 0.0002, "epoch": 5.903923725705904, "step": 8050}, {"loss": 0.6198, "grad_norm": 1.3288270235061646, "learning_rate": 0.0002, "epoch": 5.9112577924459115, "step": 8060}, {"loss": 0.6515, "grad_norm": 1.12800931930542, "learning_rate": 0.0002, "epoch": 5.918591859185918, "step": 8070}, {"loss": 0.5684, "grad_norm": 0.9449917674064636, "learning_rate": 0.0002, "epoch": 5.925925925925926, "step": 8080}, {"loss": 0.6063, "grad_norm": 1.1532357931137085, "learning_rate": 0.0002, "epoch": 5.933259992665933, "step": 8090}, {"loss": 0.5318, "grad_norm": 1.2211151123046875, "learning_rate": 0.0002, "epoch": 5.9405940594059405, "step": 8100}, {"loss": 0.6512, "grad_norm": 1.3459105491638184, "learning_rate": 0.0002, "epoch": 5.947928126145948, "step": 8110}, {"loss": 0.5952, "grad_norm": 1.251999855041504, "learning_rate": 0.0002, "epoch": 5.9552621928859555, "step": 8120}, {"loss": 0.6203, "grad_norm": 1.5682506561279297, "learning_rate": 0.0002, "epoch": 5.962596259625963, "step": 8130}, {"loss": 0.6253, "grad_norm": 0.926075279712677, "learning_rate": 0.0002, "epoch": 5.9699303263659695, "step": 8140}, {"loss": 0.5545, "grad_norm": 0.9622511863708496, "learning_rate": 0.0002, "epoch": 5.977264393105977, "step": 8150}, {"loss": 0.5518, "grad_norm": 0.9633373618125916, "learning_rate": 0.0002, "epoch": 5.9845984598459845, "step": 8160}, {"loss": 0.5831, "grad_norm": 0.8960476517677307, "learning_rate": 0.0002, "epoch": 5.991932526585992, "step": 8170}, {"loss": 0.5442, "grad_norm": 0.9372805953025818, "learning_rate": 0.0002, "epoch": 5.999266593325999, "step": 8180}, {"eval_loss": 1.3233846426010132, "eval_runtime": 32.7419, "eval_samples_per_second": 13.164, "eval_steps_per_second": 1.649, "epoch": 6.0, "step": 8181}, {"loss": 0.4644, "grad_norm": 1.1900787353515625, "learning_rate": 0.0002, "epoch": 6.006600660066007, "step": 8190}, {"loss": 0.4509, "grad_norm": 1.1448326110839844, "learning_rate": 0.0002, "epoch": 6.013934726806014, "step": 8200}, {"loss": 0.3667, "grad_norm": 1.1848368644714355, "learning_rate": 0.0002, "epoch": 6.021268793546021, "step": 8210}, {"loss": 0.4315, "grad_norm": 1.2315572500228882, "learning_rate": 0.0002, "epoch": 6.028602860286028, "step": 8220}, {"loss": 0.3541, "grad_norm": 1.2214244604110718, "learning_rate": 0.0002, "epoch": 6.035936927026036, "step": 8230}, {"loss": 0.4025, "grad_norm": 0.9455513954162598, "learning_rate": 0.0002, "epoch": 6.043270993766043, "step": 8240}, {"loss": 0.4448, "grad_norm": 0.9574248790740967, "learning_rate": 0.0002, "epoch": 6.050605060506051, "step": 8250}, {"loss": 0.4271, "grad_norm": 1.1022400856018066, "learning_rate": 0.0002, "epoch": 6.057939127246058, "step": 8260}, {"loss": 0.3603, "grad_norm": 0.9555122256278992, "learning_rate": 0.0002, "epoch": 6.065273193986065, "step": 8270}, {"loss": 0.4324, "grad_norm": 1.1956106424331665, "learning_rate": 0.0002, "epoch": 6.072607260726072, "step": 8280}, {"loss": 0.3924, "grad_norm": 1.3110876083374023, "learning_rate": 0.0002, "epoch": 6.07994132746608, "step": 8290}, {"loss": 0.3664, "grad_norm": 1.1293374300003052, "learning_rate": 0.0002, "epoch": 6.087275394206087, "step": 8300}, {"loss": 0.385, "grad_norm": 0.9176164269447327, "learning_rate": 0.0002, "epoch": 6.094609460946095, "step": 8310}, {"loss": 0.4142, "grad_norm": 0.9751231670379639, "learning_rate": 0.0002, "epoch": 6.101943527686102, "step": 8320}, {"loss": 0.4356, "grad_norm": 1.0536044836044312, "learning_rate": 0.0002, "epoch": 6.109277594426109, "step": 8330}, {"loss": 0.409, "grad_norm": 1.289342999458313, "learning_rate": 0.0002, "epoch": 6.116611661166116, "step": 8340}, {"loss": 0.4121, "grad_norm": 1.1773661375045776, "learning_rate": 0.0002, "epoch": 6.123945727906124, "step": 8350}, {"loss": 0.4499, "grad_norm": 1.2450661659240723, "learning_rate": 0.0002, "epoch": 6.131279794646131, "step": 8360}, {"loss": 0.4467, "grad_norm": 1.3965914249420166, "learning_rate": 0.0002, "epoch": 6.138613861386139, "step": 8370}, {"loss": 0.4024, "grad_norm": 1.3530808687210083, "learning_rate": 0.0002, "epoch": 6.145947928126146, "step": 8380}, {"loss": 0.4658, "grad_norm": 1.296276330947876, "learning_rate": 0.0002, "epoch": 6.153281994866154, "step": 8390}, {"loss": 0.5073, "grad_norm": 0.9759053587913513, "learning_rate": 0.0002, "epoch": 6.16061606160616, "step": 8400}, {"loss": 0.4718, "grad_norm": 1.2110707759857178, "learning_rate": 0.0002, "epoch": 6.167950128346168, "step": 8410}, {"loss": 0.4453, "grad_norm": 1.312226414680481, "learning_rate": 0.0002, "epoch": 6.175284195086175, "step": 8420}, {"loss": 0.4183, "grad_norm": 1.1696736812591553, "learning_rate": 0.0002, "epoch": 6.182618261826183, "step": 8430}, {"loss": 0.4546, "grad_norm": 1.260304570198059, "learning_rate": 0.0002, "epoch": 6.18995232856619, "step": 8440}, {"loss": 0.4137, "grad_norm": 1.472961187362671, "learning_rate": 0.0002, "epoch": 6.197286395306198, "step": 8450}, {"loss": 0.42, "grad_norm": 1.3618475198745728, "learning_rate": 0.0002, "epoch": 6.204620462046204, "step": 8460}, {"loss": 0.415, "grad_norm": 1.2544318437576294, "learning_rate": 0.0002, "epoch": 6.211954528786212, "step": 8470}, {"loss": 0.3907, "grad_norm": 1.205898642539978, "learning_rate": 0.0002, "epoch": 6.219288595526219, "step": 8480}, {"loss": 0.4431, "grad_norm": 0.9984724521636963, "learning_rate": 0.0002, "epoch": 6.226622662266227, "step": 8490}, {"loss": 0.4768, "grad_norm": 1.3184109926223755, "learning_rate": 0.0002, "epoch": 6.233956729006234, "step": 8500}, {"loss": 0.3859, "grad_norm": 1.135520100593567, "learning_rate": 0.0002, "epoch": 6.241290795746242, "step": 8510}, {"loss": 0.4159, "grad_norm": 1.4528400897979736, "learning_rate": 0.0002, "epoch": 6.248624862486249, "step": 8520}, {"loss": 0.4347, "grad_norm": 1.1222716569900513, "learning_rate": 0.0002, "epoch": 6.255958929226256, "step": 8530}, {"loss": 0.4581, "grad_norm": 1.7878046035766602, "learning_rate": 0.0002, "epoch": 6.263292995966263, "step": 8540}, {"loss": 0.4298, "grad_norm": 0.9789481163024902, "learning_rate": 0.0002, "epoch": 6.270627062706271, "step": 8550}, {"loss": 0.4316, "grad_norm": 1.151977300643921, "learning_rate": 0.0002, "epoch": 6.277961129446278, "step": 8560}, {"loss": 0.428, "grad_norm": 1.389968752861023, "learning_rate": 0.0002, "epoch": 6.2852951961862855, "step": 8570}, {"loss": 0.3903, "grad_norm": 0.884211003780365, "learning_rate": 0.0002, "epoch": 6.292629262926293, "step": 8580}, {"loss": 0.4611, "grad_norm": 1.3604296445846558, "learning_rate": 0.0002, "epoch": 6.2999633296663, "step": 8590}, {"loss": 0.4183, "grad_norm": 1.1845694780349731, "learning_rate": 0.0002, "epoch": 6.307297396406307, "step": 8600}, {"loss": 0.472, "grad_norm": 1.3231550455093384, "learning_rate": 0.0002, "epoch": 6.3146314631463145, "step": 8610}, {"loss": 0.3922, "grad_norm": 0.9546721577644348, "learning_rate": 0.0002, "epoch": 6.321965529886322, "step": 8620}, {"loss": 0.4395, "grad_norm": 1.2329787015914917, "learning_rate": 0.0002, "epoch": 6.3292995966263295, "step": 8630}, {"loss": 0.4344, "grad_norm": 1.0240199565887451, "learning_rate": 0.0002, "epoch": 6.336633663366337, "step": 8640}, {"loss": 0.4529, "grad_norm": 1.1866962909698486, "learning_rate": 0.0002, "epoch": 6.343967730106344, "step": 8650}, {"loss": 0.4575, "grad_norm": 1.2819687128067017, "learning_rate": 0.0002, "epoch": 6.351301796846351, "step": 8660}, {"loss": 0.455, "grad_norm": 0.9654944539070129, "learning_rate": 0.0002, "epoch": 6.3586358635863585, "step": 8670}, {"loss": 0.4739, "grad_norm": 0.9443874955177307, "learning_rate": 0.0002, "epoch": 6.365969930326366, "step": 8680}, {"loss": 0.435, "grad_norm": 1.2914115190505981, "learning_rate": 0.0002, "epoch": 6.373303997066373, "step": 8690}, {"loss": 0.4392, "grad_norm": 1.4558709859848022, "learning_rate": 0.0002, "epoch": 6.380638063806381, "step": 8700}, {"loss": 0.4398, "grad_norm": 1.3255952596664429, "learning_rate": 0.0002, "epoch": 6.387972130546388, "step": 8710}, {"loss": 0.4451, "grad_norm": 1.348742961883545, "learning_rate": 0.0002, "epoch": 6.395306197286395, "step": 8720}, {"loss": 0.41, "grad_norm": 1.0096025466918945, "learning_rate": 0.0002, "epoch": 6.402640264026402, "step": 8730}, {"loss": 0.4459, "grad_norm": 1.1720590591430664, "learning_rate": 0.0002, "epoch": 6.40997433076641, "step": 8740}, {"loss": 0.5059, "grad_norm": 1.1803077459335327, "learning_rate": 0.0002, "epoch": 6.417308397506417, "step": 8750}, {"loss": 0.4539, "grad_norm": 1.3649998903274536, "learning_rate": 0.0002, "epoch": 6.424642464246425, "step": 8760}, {"loss": 0.4171, "grad_norm": 1.1503992080688477, "learning_rate": 0.0002, "epoch": 6.431976530986432, "step": 8770}, {"loss": 0.488, "grad_norm": 1.1537176370620728, "learning_rate": 0.0002, "epoch": 6.43931059772644, "step": 8780}, {"loss": 0.4167, "grad_norm": 0.9743003845214844, "learning_rate": 0.0002, "epoch": 6.446644664466446, "step": 8790}, {"loss": 0.4813, "grad_norm": 0.9097744822502136, "learning_rate": 0.0002, "epoch": 6.453978731206454, "step": 8800}, {"loss": 0.4809, "grad_norm": 2.0174002647399902, "learning_rate": 0.0002, "epoch": 6.461312797946461, "step": 8810}, {"loss": 0.4879, "grad_norm": 1.0809309482574463, "learning_rate": 0.0002, "epoch": 6.468646864686469, "step": 8820}, {"loss": 0.4235, "grad_norm": 1.100294828414917, "learning_rate": 0.0002, "epoch": 6.475980931426476, "step": 8830}, {"loss": 0.4251, "grad_norm": 1.3707489967346191, "learning_rate": 0.0002, "epoch": 6.483314998166484, "step": 8840}, {"loss": 0.4533, "grad_norm": 1.1304761171340942, "learning_rate": 0.0002, "epoch": 6.49064906490649, "step": 8850}, {"loss": 0.4596, "grad_norm": 1.2171573638916016, "learning_rate": 0.0002, "epoch": 6.497983131646498, "step": 8860}, {"loss": 0.4694, "grad_norm": 1.0452901124954224, "learning_rate": 0.0002, "epoch": 6.505317198386505, "step": 8870}, {"loss": 0.4855, "grad_norm": 1.197298526763916, "learning_rate": 0.0002, "epoch": 6.512651265126513, "step": 8880}, {"loss": 0.4167, "grad_norm": 0.9179880619049072, "learning_rate": 0.0002, "epoch": 6.51998533186652, "step": 8890}, {"loss": 0.445, "grad_norm": 1.415079951286316, "learning_rate": 0.0002, "epoch": 6.527319398606528, "step": 8900}, {"loss": 0.424, "grad_norm": 1.1032487154006958, "learning_rate": 0.0002, "epoch": 6.534653465346535, "step": 8910}, {"loss": 0.4496, "grad_norm": 1.2295007705688477, "learning_rate": 0.0002, "epoch": 6.541987532086542, "step": 8920}, {"loss": 0.4755, "grad_norm": 1.4223219156265259, "learning_rate": 0.0002, "epoch": 6.549321598826549, "step": 8930}, {"loss": 0.4597, "grad_norm": 1.2785786390304565, "learning_rate": 0.0002, "epoch": 6.556655665566557, "step": 8940}, {"loss": 0.4651, "grad_norm": 1.3514775037765503, "learning_rate": 0.0002, "epoch": 6.563989732306564, "step": 8950}, {"loss": 0.4961, "grad_norm": 1.107937216758728, "learning_rate": 0.0002, "epoch": 6.571323799046572, "step": 8960}, {"loss": 0.4954, "grad_norm": 1.2839902639389038, "learning_rate": 0.0002, "epoch": 6.578657865786578, "step": 8970}, {"loss": 0.4207, "grad_norm": 0.9793244004249573, "learning_rate": 0.0002, "epoch": 6.585991932526586, "step": 8980}, {"loss": 0.4989, "grad_norm": 1.3403126001358032, "learning_rate": 0.0002, "epoch": 6.593325999266593, "step": 8990}, {"loss": 0.465, "grad_norm": 1.2612813711166382, "learning_rate": 0.0002, "epoch": 6.600660066006601, "step": 9000}, {"loss": 0.4589, "grad_norm": 1.4347625970840454, "learning_rate": 0.0002, "epoch": 6.607994132746608, "step": 9010}, {"loss": 0.4864, "grad_norm": 1.225921869277954, "learning_rate": 0.0002, "epoch": 6.6153281994866155, "step": 9020}, {"loss": 0.4364, "grad_norm": 1.033644676208496, "learning_rate": 0.0002, "epoch": 6.622662266226623, "step": 9030}, {"loss": 0.4698, "grad_norm": 1.1791894435882568, "learning_rate": 0.0002, "epoch": 6.6299963329666305, "step": 9040}, {"loss": 0.4908, "grad_norm": 1.0968137979507446, "learning_rate": 0.0002, "epoch": 6.637330399706637, "step": 9050}, {"loss": 0.4346, "grad_norm": 1.5639140605926514, "learning_rate": 0.0002, "epoch": 6.6446644664466445, "step": 9060}, {"loss": 0.4627, "grad_norm": 1.4158905744552612, "learning_rate": 0.0002, "epoch": 6.651998533186652, "step": 9070}, {"loss": 0.4619, "grad_norm": 1.2120254039764404, "learning_rate": 0.0002, "epoch": 6.6593325999266595, "step": 9080}, {"loss": 0.4564, "grad_norm": 1.1866531372070312, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 9090}, {"loss": 0.5175, "grad_norm": 1.2704026699066162, "learning_rate": 0.0002, "epoch": 6.6740007334066735, "step": 9100}, {"loss": 0.4859, "grad_norm": 1.1878353357315063, "learning_rate": 0.0002, "epoch": 6.681334800146681, "step": 9110}, {"loss": 0.4657, "grad_norm": 1.193995475769043, "learning_rate": 0.0002, "epoch": 6.6886688668866885, "step": 9120}, {"loss": 0.4939, "grad_norm": 1.2927545309066772, "learning_rate": 0.0002, "epoch": 6.696002933626696, "step": 9130}, {"loss": 0.4157, "grad_norm": 1.0770703554153442, "learning_rate": 0.0002, "epoch": 6.703337000366703, "step": 9140}, {"loss": 0.4571, "grad_norm": 1.2200851440429688, "learning_rate": 0.0002, "epoch": 6.710671067106711, "step": 9150}, {"loss": 0.4605, "grad_norm": 1.293891191482544, "learning_rate": 0.0002, "epoch": 6.718005133846718, "step": 9160}, {"loss": 0.5328, "grad_norm": 1.9376052618026733, "learning_rate": 0.0002, "epoch": 6.725339200586725, "step": 9170}, {"loss": 0.4861, "grad_norm": 1.0353254079818726, "learning_rate": 0.0002, "epoch": 6.732673267326732, "step": 9180}, {"loss": 0.5668, "grad_norm": 1.1274057626724243, "learning_rate": 0.0002, "epoch": 6.74000733406674, "step": 9190}, {"loss": 0.4486, "grad_norm": 1.3344064950942993, "learning_rate": 0.0002, "epoch": 6.747341400806747, "step": 9200}, {"loss": 0.49, "grad_norm": 1.303621768951416, "learning_rate": 0.0002, "epoch": 6.754675467546755, "step": 9210}, {"loss": 0.5059, "grad_norm": 1.2327780723571777, "learning_rate": 0.0002, "epoch": 6.762009534286762, "step": 9220}, {"loss": 0.486, "grad_norm": 1.3513109683990479, "learning_rate": 0.0002, "epoch": 6.769343601026769, "step": 9230}, {"loss": 0.5254, "grad_norm": 1.4762850999832153, "learning_rate": 0.0002, "epoch": 6.776677667766776, "step": 9240}, {"loss": 0.4181, "grad_norm": 1.0967189073562622, "learning_rate": 0.0002, "epoch": 6.784011734506784, "step": 9250}, {"loss": 0.4862, "grad_norm": 0.933936357498169, "learning_rate": 0.0002, "epoch": 6.791345801246791, "step": 9260}, {"loss": 0.4667, "grad_norm": 1.065553903579712, "learning_rate": 0.0002, "epoch": 6.798679867986799, "step": 9270}, {"loss": 0.5164, "grad_norm": 1.2044163942337036, "learning_rate": 0.0002, "epoch": 6.806013934726806, "step": 9280}, {"loss": 0.4648, "grad_norm": 1.404137134552002, "learning_rate": 0.0002, "epoch": 6.813348001466814, "step": 9290}, {"loss": 0.4442, "grad_norm": 1.4005582332611084, "learning_rate": 0.0002, "epoch": 6.82068206820682, "step": 9300}, {"loss": 0.459, "grad_norm": 1.1771104335784912, "learning_rate": 0.0002, "epoch": 6.828016134946828, "step": 9310}, {"loss": 0.5059, "grad_norm": 1.191933035850525, "learning_rate": 0.0002, "epoch": 6.835350201686835, "step": 9320}, {"loss": 0.4733, "grad_norm": 1.3395432233810425, "learning_rate": 0.0002, "epoch": 6.842684268426843, "step": 9330}, {"loss": 0.4882, "grad_norm": 1.4145503044128418, "learning_rate": 0.0002, "epoch": 6.85001833516685, "step": 9340}, {"loss": 0.4872, "grad_norm": 1.1128839254379272, "learning_rate": 0.0002, "epoch": 6.857352401906858, "step": 9350}, {"loss": 0.4909, "grad_norm": 1.0771174430847168, "learning_rate": 0.0002, "epoch": 6.864686468646864, "step": 9360}, {"loss": 0.4739, "grad_norm": 1.1089814901351929, "learning_rate": 0.0002, "epoch": 6.872020535386872, "step": 9370}, {"loss": 0.4854, "grad_norm": 1.078444004058838, "learning_rate": 0.0002, "epoch": 6.879354602126879, "step": 9380}, {"loss": 0.4904, "grad_norm": 1.3676636219024658, "learning_rate": 0.0002, "epoch": 6.886688668866887, "step": 9390}, {"loss": 0.4854, "grad_norm": 0.8973749876022339, "learning_rate": 0.0002, "epoch": 6.894022735606894, "step": 9400}, {"loss": 0.4274, "grad_norm": 1.141552448272705, "learning_rate": 0.0002, "epoch": 6.901356802346902, "step": 9410}, {"loss": 0.4972, "grad_norm": 0.8345359563827515, "learning_rate": 0.0002, "epoch": 6.908690869086909, "step": 9420}, {"loss": 0.5218, "grad_norm": 1.1602197885513306, "learning_rate": 0.0002, "epoch": 6.916024935826916, "step": 9430}, {"loss": 0.4911, "grad_norm": 1.275466799736023, "learning_rate": 0.0002, "epoch": 6.923359002566923, "step": 9440}, {"loss": 0.4904, "grad_norm": 0.9186071157455444, "learning_rate": 0.0002, "epoch": 6.930693069306931, "step": 9450}, {"loss": 0.4604, "grad_norm": 0.9069198966026306, "learning_rate": 0.0002, "epoch": 6.938027136046938, "step": 9460}, {"loss": 0.4363, "grad_norm": 1.2331899404525757, "learning_rate": 0.0002, "epoch": 6.945361202786946, "step": 9470}, {"loss": 0.4815, "grad_norm": 0.8685150742530823, "learning_rate": 0.0002, "epoch": 6.952695269526953, "step": 9480}, {"loss": 0.4424, "grad_norm": 1.4067939519882202, "learning_rate": 0.0002, "epoch": 6.96002933626696, "step": 9490}, {"loss": 0.5089, "grad_norm": 1.1864029169082642, "learning_rate": 0.0002, "epoch": 6.967363403006967, "step": 9500}, {"loss": 0.4906, "grad_norm": 1.3697725534439087, "learning_rate": 0.0002, "epoch": 6.974697469746975, "step": 9510}, {"loss": 0.4797, "grad_norm": 1.1632893085479736, "learning_rate": 0.0002, "epoch": 6.982031536486982, "step": 9520}, {"loss": 0.4526, "grad_norm": 1.1447268724441528, "learning_rate": 0.0002, "epoch": 6.9893656032269895, "step": 9530}, {"loss": 0.4627, "grad_norm": 1.5017213821411133, "learning_rate": 0.0002, "epoch": 6.996699669966997, "step": 9540}, {"eval_loss": 1.4178194999694824, "eval_runtime": 32.7488, "eval_samples_per_second": 13.161, "eval_steps_per_second": 1.649, "epoch": 6.999633296662999, "step": 9544}, {"loss": 0.4396, "grad_norm": 1.110981822013855, "learning_rate": 0.0002, "epoch": 7.0040337367070045, "step": 9550}, {"loss": 0.3475, "grad_norm": 1.2793253660202026, "learning_rate": 0.0002, "epoch": 7.011367803447011, "step": 9560}, {"loss": 0.4022, "grad_norm": 1.1258823871612549, "learning_rate": 0.0002, "epoch": 7.0187018701870185, "step": 9570}, {"loss": 0.3008, "grad_norm": 1.410486102104187, "learning_rate": 0.0002, "epoch": 7.026035936927026, "step": 9580}, {"loss": 0.3716, "grad_norm": 1.2088500261306763, "learning_rate": 0.0002, "epoch": 7.0333700036670335, "step": 9590}, {"loss": 0.3379, "grad_norm": 0.8303650617599487, "learning_rate": 0.0002, "epoch": 7.040704070407041, "step": 9600}, {"loss": 0.3376, "grad_norm": 0.9813525080680847, "learning_rate": 0.0002, "epoch": 7.048038137147048, "step": 9610}, {"loss": 0.3202, "grad_norm": 0.9679017066955566, "learning_rate": 0.0002, "epoch": 7.055372203887055, "step": 9620}, {"loss": 0.3287, "grad_norm": 1.1532220840454102, "learning_rate": 0.0002, "epoch": 7.0627062706270625, "step": 9630}, {"loss": 0.3639, "grad_norm": 1.312053918838501, "learning_rate": 0.0002, "epoch": 7.07004033736707, "step": 9640}, {"loss": 0.3278, "grad_norm": 1.0594364404678345, "learning_rate": 0.0002, "epoch": 7.077374404107077, "step": 9650}, {"loss": 0.3259, "grad_norm": 1.545080542564392, "learning_rate": 0.0002, "epoch": 7.084708470847085, "step": 9660}, {"loss": 0.328, "grad_norm": 1.1748381853103638, "learning_rate": 0.0002, "epoch": 7.092042537587092, "step": 9670}, {"loss": 0.3313, "grad_norm": 1.6107453107833862, "learning_rate": 0.0002, "epoch": 7.0993766043271, "step": 9680}, {"loss": 0.3469, "grad_norm": 0.9478244185447693, "learning_rate": 0.0002, "epoch": 7.106710671067106, "step": 9690}, {"loss": 0.3289, "grad_norm": 1.508410930633545, "learning_rate": 0.0002, "epoch": 7.114044737807114, "step": 9700}, {"loss": 0.3077, "grad_norm": 1.3175169229507446, "learning_rate": 0.0002, "epoch": 7.121378804547121, "step": 9710}, {"loss": 0.3241, "grad_norm": 1.2631924152374268, "learning_rate": 0.0002, "epoch": 7.128712871287129, "step": 9720}, {"loss": 0.3806, "grad_norm": 1.0640755891799927, "learning_rate": 0.0002, "epoch": 7.136046938027136, "step": 9730}, {"loss": 0.3418, "grad_norm": 1.247279405593872, "learning_rate": 0.0002, "epoch": 7.143381004767144, "step": 9740}, {"loss": 0.3385, "grad_norm": 1.2538974285125732, "learning_rate": 0.0002, "epoch": 7.15071507150715, "step": 9750}, {"loss": 0.3445, "grad_norm": 1.3157252073287964, "learning_rate": 0.0002, "epoch": 7.158049138247158, "step": 9760}, {"loss": 0.3518, "grad_norm": 1.5254220962524414, "learning_rate": 0.0002, "epoch": 7.165383204987165, "step": 9770}, {"loss": 0.3575, "grad_norm": 1.0063719749450684, "learning_rate": 0.0002, "epoch": 7.172717271727173, "step": 9780}, {"loss": 0.3701, "grad_norm": 0.8030351996421814, "learning_rate": 0.0002, "epoch": 7.18005133846718, "step": 9790}, {"loss": 0.3645, "grad_norm": 1.2086257934570312, "learning_rate": 0.0002, "epoch": 7.187385405207188, "step": 9800}, {"loss": 0.354, "grad_norm": 1.7020413875579834, "learning_rate": 0.0002, "epoch": 7.194719471947194, "step": 9810}, {"loss": 0.3242, "grad_norm": 1.2517976760864258, "learning_rate": 0.0002, "epoch": 7.202053538687202, "step": 9820}, {"loss": 0.3503, "grad_norm": 1.330505132675171, "learning_rate": 0.0002, "epoch": 7.209387605427209, "step": 9830}, {"loss": 0.3349, "grad_norm": 1.1273366212844849, "learning_rate": 0.0002, "epoch": 7.216721672167217, "step": 9840}, {"loss": 0.3562, "grad_norm": 1.3738148212432861, "learning_rate": 0.0002, "epoch": 7.224055738907224, "step": 9850}, {"loss": 0.3622, "grad_norm": 1.2162928581237793, "learning_rate": 0.0002, "epoch": 7.231389805647232, "step": 9860}, {"loss": 0.3913, "grad_norm": 1.743969440460205, "learning_rate": 0.0002, "epoch": 7.238723872387239, "step": 9870}, {"loss": 0.3855, "grad_norm": 1.5357484817504883, "learning_rate": 0.0002, "epoch": 7.246057939127246, "step": 9880}, {"loss": 0.3556, "grad_norm": 1.342976450920105, "learning_rate": 0.0002, "epoch": 7.253392005867253, "step": 9890}, {"loss": 0.3787, "grad_norm": 1.428523302078247, "learning_rate": 0.0002, "epoch": 7.260726072607261, "step": 9900}, {"loss": 0.343, "grad_norm": 1.5631695985794067, "learning_rate": 0.0002, "epoch": 7.268060139347268, "step": 9910}, {"loss": 0.3292, "grad_norm": 1.192564606666565, "learning_rate": 0.0002, "epoch": 7.275394206087276, "step": 9920}, {"loss": 0.324, "grad_norm": 1.1428006887435913, "learning_rate": 0.0002, "epoch": 7.282728272827283, "step": 9930}, {"loss": 0.2868, "grad_norm": 1.1959515810012817, "learning_rate": 0.0002, "epoch": 7.29006233956729, "step": 9940}, {"loss": 0.3623, "grad_norm": 0.984326958656311, "learning_rate": 0.0002, "epoch": 7.297396406307297, "step": 9950}, {"loss": 0.3442, "grad_norm": 1.1258848905563354, "learning_rate": 0.0002, "epoch": 7.304730473047305, "step": 9960}, {"loss": 0.3682, "grad_norm": 1.2445521354675293, "learning_rate": 0.0002, "epoch": 7.312064539787312, "step": 9970}, {"loss": 0.4032, "grad_norm": 1.066351294517517, "learning_rate": 0.0002, "epoch": 7.3193986065273196, "step": 9980}, {"loss": 0.3689, "grad_norm": 1.3111763000488281, "learning_rate": 0.0002, "epoch": 7.326732673267327, "step": 9990}, {"loss": 0.4251, "grad_norm": 1.5113508701324463, "learning_rate": 0.0002, "epoch": 7.334066740007334, "step": 10000}, {"loss": 0.3668, "grad_norm": 1.2499724626541138, "learning_rate": 0.0002, "epoch": 7.341400806747341, "step": 10010}, {"loss": 0.392, "grad_norm": 1.1003477573394775, "learning_rate": 0.0002, "epoch": 7.3487348734873486, "step": 10020}, {"loss": 0.4045, "grad_norm": 1.4911425113677979, "learning_rate": 0.0002, "epoch": 7.356068940227356, "step": 10030}, {"loss": 0.3892, "grad_norm": 1.291712999343872, "learning_rate": 0.0002, "epoch": 7.3634030069673635, "step": 10040}, {"loss": 0.3817, "grad_norm": 1.4001942873001099, "learning_rate": 0.0002, "epoch": 7.370737073707371, "step": 10050}, {"loss": 0.404, "grad_norm": 2.015535593032837, "learning_rate": 0.0002, "epoch": 7.378071140447378, "step": 10060}, {"loss": 0.3758, "grad_norm": 1.3355735540390015, "learning_rate": 0.0002, "epoch": 7.385405207187385, "step": 10070}, {"loss": 0.3764, "grad_norm": 1.1258678436279297, "learning_rate": 0.0002, "epoch": 7.3927392739273925, "step": 10080}, {"loss": 0.3827, "grad_norm": 1.3883707523345947, "learning_rate": 0.0002, "epoch": 7.4000733406674, "step": 10090}, {"loss": 0.3623, "grad_norm": 1.144474744796753, "learning_rate": 0.0002, "epoch": 7.407407407407407, "step": 10100}, {"loss": 0.4024, "grad_norm": 1.636843204498291, "learning_rate": 0.0002, "epoch": 7.414741474147415, "step": 10110}, {"loss": 0.3924, "grad_norm": 1.6167247295379639, "learning_rate": 0.0002, "epoch": 7.422075540887422, "step": 10120}, {"loss": 0.4233, "grad_norm": 1.3800078630447388, "learning_rate": 0.0002, "epoch": 7.429409607627429, "step": 10130}, {"loss": 0.3859, "grad_norm": 1.2631969451904297, "learning_rate": 0.0002, "epoch": 7.436743674367436, "step": 10140}, {"loss": 0.3523, "grad_norm": 1.32834792137146, "learning_rate": 0.0002, "epoch": 7.444077741107444, "step": 10150}, {"loss": 0.3945, "grad_norm": 1.370316982269287, "learning_rate": 0.0002, "epoch": 7.451411807847451, "step": 10160}, {"loss": 0.3695, "grad_norm": 1.6096234321594238, "learning_rate": 0.0002, "epoch": 7.458745874587459, "step": 10170}, {"loss": 0.3378, "grad_norm": 1.3638662099838257, "learning_rate": 0.0002, "epoch": 7.466079941327466, "step": 10180}, {"loss": 0.4015, "grad_norm": 1.3508107662200928, "learning_rate": 0.0002, "epoch": 7.473414008067474, "step": 10190}, {"loss": 0.4169, "grad_norm": 1.5599194765090942, "learning_rate": 0.0002, "epoch": 7.48074807480748, "step": 10200}, {"loss": 0.4071, "grad_norm": 1.4922538995742798, "learning_rate": 0.0002, "epoch": 7.488082141547488, "step": 10210}, {"loss": 0.419, "grad_norm": 1.485437273979187, "learning_rate": 0.0002, "epoch": 7.495416208287495, "step": 10220}, {"loss": 0.382, "grad_norm": 0.9040785431861877, "learning_rate": 0.0002, "epoch": 7.502750275027503, "step": 10230}, {"loss": 0.3437, "grad_norm": 1.2453011274337769, "learning_rate": 0.0002, "epoch": 7.51008434176751, "step": 10240}, {"loss": 0.4063, "grad_norm": 1.4167460203170776, "learning_rate": 0.0002, "epoch": 7.517418408507518, "step": 10250}, {"loss": 0.402, "grad_norm": 1.396972894668579, "learning_rate": 0.0002, "epoch": 7.524752475247524, "step": 10260}, {"loss": 0.3658, "grad_norm": 1.384286880493164, "learning_rate": 0.0002, "epoch": 7.532086541987532, "step": 10270}, {"loss": 0.3813, "grad_norm": 1.478095531463623, "learning_rate": 0.0002, "epoch": 7.539420608727539, "step": 10280}, {"loss": 0.3813, "grad_norm": 1.2642205953598022, "learning_rate": 0.0002, "epoch": 7.546754675467547, "step": 10290}, {"loss": 0.4084, "grad_norm": 1.1110541820526123, "learning_rate": 0.0002, "epoch": 7.554088742207554, "step": 10300}, {"loss": 0.418, "grad_norm": 1.1147890090942383, "learning_rate": 0.0002, "epoch": 7.561422808947562, "step": 10310}, {"loss": 0.4148, "grad_norm": 1.5677998065948486, "learning_rate": 0.0002, "epoch": 7.568756875687569, "step": 10320}, {"loss": 0.3675, "grad_norm": 0.9347636699676514, "learning_rate": 0.0002, "epoch": 7.576090942427576, "step": 10330}, {"loss": 0.3534, "grad_norm": 1.1126737594604492, "learning_rate": 0.0002, "epoch": 7.583425009167583, "step": 10340}, {"loss": 0.418, "grad_norm": 1.462611198425293, "learning_rate": 0.0002, "epoch": 7.590759075907591, "step": 10350}, {"loss": 0.3755, "grad_norm": 0.9907522201538086, "learning_rate": 0.0002, "epoch": 7.598093142647598, "step": 10360}, {"loss": 0.4133, "grad_norm": 1.306152582168579, "learning_rate": 0.0002, "epoch": 7.605427209387606, "step": 10370}, {"loss": 0.3644, "grad_norm": 1.11135995388031, "learning_rate": 0.0002, "epoch": 7.612761276127613, "step": 10380}, {"loss": 0.3659, "grad_norm": 1.0825806856155396, "learning_rate": 0.0002, "epoch": 7.62009534286762, "step": 10390}, {"loss": 0.3952, "grad_norm": 1.5346975326538086, "learning_rate": 0.0002, "epoch": 7.627429409607627, "step": 10400}, {"loss": 0.3807, "grad_norm": 1.5885388851165771, "learning_rate": 0.0002, "epoch": 7.634763476347635, "step": 10410}, {"loss": 0.3751, "grad_norm": 1.130261778831482, "learning_rate": 0.0002, "epoch": 7.642097543087642, "step": 10420}, {"loss": 0.3598, "grad_norm": 1.2318342924118042, "learning_rate": 0.0002, "epoch": 7.64943160982765, "step": 10430}, {"loss": 0.3823, "grad_norm": 1.07103431224823, "learning_rate": 0.0002, "epoch": 7.656765676567657, "step": 10440}, {"loss": 0.4707, "grad_norm": 1.3836923837661743, "learning_rate": 0.0002, "epoch": 7.6640997433076645, "step": 10450}, {"loss": 0.3829, "grad_norm": 1.2110271453857422, "learning_rate": 0.0002, "epoch": 7.671433810047671, "step": 10460}, {"loss": 0.3747, "grad_norm": 1.2304844856262207, "learning_rate": 0.0002, "epoch": 7.678767876787679, "step": 10470}, {"loss": 0.3761, "grad_norm": 1.3444706201553345, "learning_rate": 0.0002, "epoch": 7.686101943527686, "step": 10480}, {"loss": 0.3772, "grad_norm": 1.151705026626587, "learning_rate": 0.0002, "epoch": 7.6934360102676935, "step": 10490}, {"loss": 0.3351, "grad_norm": 1.4373983144760132, "learning_rate": 0.0002, "epoch": 7.700770077007701, "step": 10500}, {"loss": 0.389, "grad_norm": 1.5898514986038208, "learning_rate": 0.0002, "epoch": 7.7081041437477085, "step": 10510}, {"loss": 0.3679, "grad_norm": 1.3767389059066772, "learning_rate": 0.0002, "epoch": 7.715438210487715, "step": 10520}, {"loss": 0.4023, "grad_norm": 1.3730027675628662, "learning_rate": 0.0002, "epoch": 7.7227722772277225, "step": 10530}, {"loss": 0.4291, "grad_norm": 1.3917304277420044, "learning_rate": 0.0002, "epoch": 7.73010634396773, "step": 10540}, {"loss": 0.4378, "grad_norm": 1.230663776397705, "learning_rate": 0.0002, "epoch": 7.7374404107077375, "step": 10550}, {"loss": 0.4289, "grad_norm": 0.9922441244125366, "learning_rate": 0.0002, "epoch": 7.744774477447745, "step": 10560}, {"loss": 0.4118, "grad_norm": 1.4576551914215088, "learning_rate": 0.0002, "epoch": 7.752108544187752, "step": 10570}, {"loss": 0.3793, "grad_norm": 1.061685562133789, "learning_rate": 0.0002, "epoch": 7.75944261092776, "step": 10580}, {"loss": 0.3748, "grad_norm": 1.1523276567459106, "learning_rate": 0.0002, "epoch": 7.7667766776677665, "step": 10590}, {"loss": 0.3982, "grad_norm": 1.3917267322540283, "learning_rate": 0.0002, "epoch": 7.774110744407774, "step": 10600}, {"loss": 0.4305, "grad_norm": 1.1314283609390259, "learning_rate": 0.0002, "epoch": 7.781444811147781, "step": 10610}, {"loss": 0.4244, "grad_norm": 1.624324083328247, "learning_rate": 0.0002, "epoch": 7.788778877887789, "step": 10620}, {"loss": 0.4129, "grad_norm": 1.5369168519973755, "learning_rate": 0.0002, "epoch": 7.796112944627796, "step": 10630}, {"loss": 0.3661, "grad_norm": 1.082222819328308, "learning_rate": 0.0002, "epoch": 7.803447011367804, "step": 10640}, {"loss": 0.4341, "grad_norm": 1.281540870666504, "learning_rate": 0.0002, "epoch": 7.81078107810781, "step": 10650}, {"loss": 0.3939, "grad_norm": 1.1889171600341797, "learning_rate": 0.0002, "epoch": 7.818115144847818, "step": 10660}, {"loss": 0.3923, "grad_norm": 1.5310896635055542, "learning_rate": 0.0002, "epoch": 7.825449211587825, "step": 10670}, {"loss": 0.4148, "grad_norm": 1.6724708080291748, "learning_rate": 0.0002, "epoch": 7.832783278327833, "step": 10680}, {"loss": 0.363, "grad_norm": 1.3742409944534302, "learning_rate": 0.0002, "epoch": 7.84011734506784, "step": 10690}, {"loss": 0.3599, "grad_norm": 1.2421947717666626, "learning_rate": 0.0002, "epoch": 7.847451411807848, "step": 10700}, {"loss": 0.3829, "grad_norm": 1.0233848094940186, "learning_rate": 0.0002, "epoch": 7.854785478547855, "step": 10710}, {"loss": 0.4265, "grad_norm": 1.640842080116272, "learning_rate": 0.0002, "epoch": 7.862119545287862, "step": 10720}, {"loss": 0.4257, "grad_norm": 1.3571926355361938, "learning_rate": 0.0002, "epoch": 7.869453612027869, "step": 10730}, {"loss": 0.4454, "grad_norm": 1.459564208984375, "learning_rate": 0.0002, "epoch": 7.876787678767877, "step": 10740}, {"loss": 0.3903, "grad_norm": 0.9202831387519836, "learning_rate": 0.0002, "epoch": 7.884121745507884, "step": 10750}, {"loss": 0.4149, "grad_norm": 1.3509176969528198, "learning_rate": 0.0002, "epoch": 7.891455812247892, "step": 10760}, {"loss": 0.4001, "grad_norm": 1.5858603715896606, "learning_rate": 0.0002, "epoch": 7.898789878987898, "step": 10770}, {"loss": 0.3753, "grad_norm": 1.2391952276229858, "learning_rate": 0.0002, "epoch": 7.906123945727906, "step": 10780}, {"loss": 0.4085, "grad_norm": 1.3442552089691162, "learning_rate": 0.0002, "epoch": 7.913458012467913, "step": 10790}, {"loss": 0.4377, "grad_norm": 1.7327884435653687, "learning_rate": 0.0002, "epoch": 7.920792079207921, "step": 10800}, {"loss": 0.376, "grad_norm": 1.4246922731399536, "learning_rate": 0.0002, "epoch": 7.928126145947928, "step": 10810}, {"loss": 0.4158, "grad_norm": 1.4421411752700806, "learning_rate": 0.0002, "epoch": 7.935460212687936, "step": 10820}, {"loss": 0.4084, "grad_norm": 1.3445014953613281, "learning_rate": 0.0002, "epoch": 7.942794279427943, "step": 10830}, {"loss": 0.3986, "grad_norm": 1.2219295501708984, "learning_rate": 0.0002, "epoch": 7.950128346167951, "step": 10840}, {"loss": 0.428, "grad_norm": 1.241843342781067, "learning_rate": 0.0002, "epoch": 7.957462412907957, "step": 10850}, {"loss": 0.3776, "grad_norm": 0.9814007878303528, "learning_rate": 0.0002, "epoch": 7.964796479647965, "step": 10860}, {"loss": 0.4866, "grad_norm": 1.4015462398529053, "learning_rate": 0.0002, "epoch": 7.972130546387972, "step": 10870}, {"loss": 0.437, "grad_norm": 1.4638406038284302, "learning_rate": 0.0002, "epoch": 7.97946461312798, "step": 10880}, {"loss": 0.4191, "grad_norm": 1.585194706916809, "learning_rate": 0.0002, "epoch": 7.986798679867987, "step": 10890}, {"loss": 0.3969, "grad_norm": 1.197031855583191, "learning_rate": 0.0002, "epoch": 7.994132746607994, "step": 10900}]}