diff --git a/.gitattributes b/.gitattributes index e7c9b529c9d0119cdbfc2f190fb6ee13d1624895..f1c84864ebdf27f0f951808b46e41461ed9fae79 100644 --- a/.gitattributes +++ b/.gitattributes @@ -938,3 +938,12 @@ gemma-2b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32 gemma-2b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-308-sd-42/checkpoint-64/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-308-sd-42/checkpoint-96/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-308-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f512b3457d4ef0662b10279451f78e55458a70a9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8767e0d0604ccff5925fd06b1ebf5a468436644640dabefecdff8b512facf8aa +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..118a6342a9d425a40112200b0ade39aeb26e6074 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abcdfcac6f09b8d04b847f0786435c773d18d77630561c9d325f90ae90a09178 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ef2f91a640c4bd3bb25d9cc246e46c9c1b847b2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4baeb9cc32972d690c5b7a9b98b0eb28c6b8afd2ce9c908058754805719d6dd +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b65f38e866f2e7644f8c295eb82181a90805765c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f3421092a5ba4c2db1ce51d3b93698e8a2c6e0239127b2a377690d0d1f4cc1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddf6305008abf7ffbeb48eb5fde5e3982aded719 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9340a5bf0cf4c7a2611e19a4b084d5b6085acada4269b8e0cfcdade92431d60d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3552e32a37010bd9bf54689b68d9f664311ca981 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/trainer_state.json @@ -0,0 +1,7587 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 7.997011580127008, + "eval_steps": 10, + "global_step": 10704, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + }, + { + "epoch": 2.002241314904744, + "grad_norm": 0.36817631125450134, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 2680 + }, + { + "epoch": 2.0097123645872244, + "grad_norm": 0.4056456685066223, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2690 + }, + { + "epoch": 2.017183414269705, + "grad_norm": 0.37416863441467285, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 2700 + }, + { + "epoch": 2.024654463952185, + "grad_norm": 0.4273638427257538, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2710 + }, + { + "epoch": 2.0321255136346656, + "grad_norm": 0.36497923731803894, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2720 + }, + { + "epoch": 2.0395965633171462, + "grad_norm": 0.5021994113922119, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 2730 + }, + { + "epoch": 2.0470676129996264, + "grad_norm": 0.45896220207214355, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 2740 + }, + { + "epoch": 2.054538662682107, + "grad_norm": 0.3973815143108368, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2750 + }, + { + "epoch": 2.062009712364587, + "grad_norm": 0.4521815776824951, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2760 + }, + { + "epoch": 2.0694807620470677, + "grad_norm": 0.42775002121925354, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2770 + }, + { + "epoch": 2.076951811729548, + "grad_norm": 0.48158586025238037, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 2780 + }, + { + "epoch": 2.0844228614120284, + "grad_norm": 0.4612371623516083, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2790 + }, + { + "epoch": 2.091893911094509, + "grad_norm": 0.42536866664886475, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 2800 + }, + { + "epoch": 2.099364960776989, + "grad_norm": 0.48515772819519043, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 2810 + }, + { + "epoch": 2.1068360104594697, + "grad_norm": 0.41418662667274475, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2820 + }, + { + "epoch": 2.11430706014195, + "grad_norm": 0.4683697819709778, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2830 + }, + { + "epoch": 2.1217781098244304, + "grad_norm": 0.4484657049179077, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2840 + }, + { + "epoch": 2.1292491595069105, + "grad_norm": 0.6621400713920593, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 2850 + }, + { + "epoch": 2.136720209189391, + "grad_norm": 0.45074811577796936, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2860 + }, + { + "epoch": 2.1441912588718717, + "grad_norm": 0.3513113558292389, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2870 + }, + { + "epoch": 2.151662308554352, + "grad_norm": 0.40411314368247986, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2880 + }, + { + "epoch": 2.1591333582368324, + "grad_norm": 0.4121065139770508, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 2890 + }, + { + "epoch": 2.1666044079193125, + "grad_norm": 0.44723689556121826, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 2900 + }, + { + "epoch": 2.174075457601793, + "grad_norm": 0.4226122498512268, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2910 + }, + { + "epoch": 2.1815465072842732, + "grad_norm": 0.46617650985717773, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2920 + }, + { + "epoch": 2.189017556966754, + "grad_norm": 0.4506422281265259, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 2930 + }, + { + "epoch": 2.1964886066492344, + "grad_norm": 0.4892672896385193, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2940 + }, + { + "epoch": 2.2039596563317145, + "grad_norm": 0.44095516204833984, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2950 + }, + { + "epoch": 2.211430706014195, + "grad_norm": 0.41522109508514404, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2960 + }, + { + "epoch": 2.2189017556966752, + "grad_norm": 0.4860858917236328, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2970 + }, + { + "epoch": 2.226372805379156, + "grad_norm": 0.42662516236305237, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2980 + }, + { + "epoch": 2.233843855061636, + "grad_norm": 0.4390648305416107, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2990 + }, + { + "epoch": 2.2413149047441165, + "grad_norm": 0.47515565156936646, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3000 + }, + { + "epoch": 2.248785954426597, + "grad_norm": 0.4104543924331665, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 3010 + }, + { + "epoch": 2.2562570041090773, + "grad_norm": 0.4404028654098511, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 3020 + }, + { + "epoch": 2.263728053791558, + "grad_norm": 0.4717366695404053, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3030 + }, + { + "epoch": 2.271199103474038, + "grad_norm": 0.48345857858657837, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 3040 + }, + { + "epoch": 2.2786701531565186, + "grad_norm": 0.5312452912330627, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 3050 + }, + { + "epoch": 2.2861412028389987, + "grad_norm": 0.5073099732398987, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3060 + }, + { + "epoch": 2.2936122525214793, + "grad_norm": 0.5027463436126709, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 3070 + }, + { + "epoch": 2.30108330220396, + "grad_norm": 0.5436304807662964, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3080 + }, + { + "epoch": 2.30855435188644, + "grad_norm": 0.4701065123081207, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3090 + }, + { + "epoch": 2.3160254015689206, + "grad_norm": 0.46988746523857117, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 3100 + }, + { + "epoch": 2.3234964512514007, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 3110 + }, + { + "epoch": 2.3309675009338813, + "grad_norm": 0.5173566937446594, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 3120 + }, + { + "epoch": 2.3384385506163614, + "grad_norm": 0.40345850586891174, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 3130 + }, + { + "epoch": 2.345909600298842, + "grad_norm": 0.4218924939632416, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3140 + }, + { + "epoch": 2.3533806499813226, + "grad_norm": 0.41857317090034485, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 3150 + }, + { + "epoch": 2.3608516996638027, + "grad_norm": 0.4197218418121338, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3160 + }, + { + "epoch": 2.3683227493462833, + "grad_norm": 0.4260677397251129, + "learning_rate": 0.0002, + "loss": 1.6572, + "step": 3170 + }, + { + "epoch": 2.3757937990287634, + "grad_norm": 0.4209042191505432, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3180 + }, + { + "epoch": 2.383264848711244, + "grad_norm": 0.4092234969139099, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3190 + }, + { + "epoch": 2.390735898393724, + "grad_norm": 0.4928431510925293, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 3200 + }, + { + "epoch": 2.3982069480762047, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3210 + }, + { + "epoch": 2.4056779977586853, + "grad_norm": 0.4368397295475006, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3220 + }, + { + "epoch": 2.4131490474411654, + "grad_norm": 0.46122390031814575, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 3230 + }, + { + "epoch": 2.420620097123646, + "grad_norm": 0.4272301197052002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.428091146806126, + "grad_norm": 0.41480937600135803, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3250 + }, + { + "epoch": 2.4355621964886067, + "grad_norm": 0.48911941051483154, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 3260 + }, + { + "epoch": 2.443033246171087, + "grad_norm": 0.4444098472595215, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 3270 + }, + { + "epoch": 2.4505042958535674, + "grad_norm": 0.5111684799194336, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 3280 + }, + { + "epoch": 2.457975345536048, + "grad_norm": 0.5058825016021729, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 3290 + }, + { + "epoch": 2.465446395218528, + "grad_norm": 0.44173210859298706, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3300 + }, + { + "epoch": 2.4729174449010087, + "grad_norm": 0.4659745991230011, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 3310 + }, + { + "epoch": 2.480388494583489, + "grad_norm": 0.47237497568130493, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3320 + }, + { + "epoch": 2.4878595442659694, + "grad_norm": 0.47303131222724915, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 3330 + }, + { + "epoch": 2.4953305939484496, + "grad_norm": 0.4522389769554138, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 3340 + }, + { + "epoch": 2.50280164363093, + "grad_norm": 0.4467332363128662, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3350 + }, + { + "epoch": 2.5102726933134107, + "grad_norm": 0.4413762092590332, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3360 + }, + { + "epoch": 2.517743742995891, + "grad_norm": 0.495514452457428, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 3370 + }, + { + "epoch": 2.5252147926783715, + "grad_norm": 0.4429773986339569, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3380 + }, + { + "epoch": 2.5326858423608516, + "grad_norm": 0.4589079022407532, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3390 + }, + { + "epoch": 2.540156892043332, + "grad_norm": 0.4683997333049774, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 3400 + }, + { + "epoch": 2.5476279417258123, + "grad_norm": 0.4651731252670288, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 3410 + }, + { + "epoch": 2.555098991408293, + "grad_norm": 0.45818084478378296, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3420 + }, + { + "epoch": 2.5625700410907735, + "grad_norm": 0.45209529995918274, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.5700410907732536, + "grad_norm": 0.4344733655452728, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3440 + }, + { + "epoch": 2.577512140455734, + "grad_norm": 0.47435566782951355, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 3450 + }, + { + "epoch": 2.5849831901382143, + "grad_norm": 0.43841999769210815, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3460 + }, + { + "epoch": 2.592454239820695, + "grad_norm": 0.4323869049549103, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 3470 + }, + { + "epoch": 2.599925289503175, + "grad_norm": 0.44355881214141846, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 3480 + }, + { + "epoch": 2.6073963391856556, + "grad_norm": 0.45847779512405396, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 3490 + }, + { + "epoch": 2.614867388868136, + "grad_norm": 0.4411061704158783, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3500 + }, + { + "epoch": 2.6223384385506163, + "grad_norm": 0.4446796178817749, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3510 + }, + { + "epoch": 2.629809488233097, + "grad_norm": 0.41969653964042664, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3520 + }, + { + "epoch": 2.637280537915577, + "grad_norm": 0.5263747572898865, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 3530 + }, + { + "epoch": 2.6447515875980576, + "grad_norm": 0.47719451785087585, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3540 + }, + { + "epoch": 2.6522226372805378, + "grad_norm": 0.46574118733406067, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 3550 + }, + { + "epoch": 2.6596936869630183, + "grad_norm": 0.46867135167121887, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 3560 + }, + { + "epoch": 2.667164736645499, + "grad_norm": 0.4441198706626892, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3570 + }, + { + "epoch": 2.674635786327979, + "grad_norm": 0.4871319830417633, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3580 + }, + { + "epoch": 2.6821068360104596, + "grad_norm": 0.43900373578071594, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 3590 + }, + { + "epoch": 2.6895778856929398, + "grad_norm": 0.42509549856185913, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 3600 + }, + { + "epoch": 2.6970489353754203, + "grad_norm": 0.4691086709499359, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3610 + }, + { + "epoch": 2.7045199850579005, + "grad_norm": 0.46318942308425903, + "learning_rate": 0.0002, + "loss": 1.5491, + "step": 3620 + }, + { + "epoch": 2.711991034740381, + "grad_norm": 0.44631096720695496, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3630 + }, + { + "epoch": 2.7194620844228616, + "grad_norm": 0.42315489053726196, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3640 + }, + { + "epoch": 2.7269331341053418, + "grad_norm": 0.4971241056919098, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3650 + }, + { + "epoch": 2.7344041837878224, + "grad_norm": 0.4578486382961273, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 3660 + }, + { + "epoch": 2.7418752334703025, + "grad_norm": 0.46584776043891907, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3670 + }, + { + "epoch": 2.749346283152783, + "grad_norm": 0.4951731264591217, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 3680 + }, + { + "epoch": 2.756817332835263, + "grad_norm": 0.4935225546360016, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3690 + }, + { + "epoch": 2.764288382517744, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3700 + }, + { + "epoch": 2.7717594322002244, + "grad_norm": 0.4417555630207062, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 3710 + }, + { + "epoch": 2.7792304818827045, + "grad_norm": 0.48229655623435974, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 3720 + }, + { + "epoch": 2.786701531565185, + "grad_norm": 0.48562315106391907, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3730 + }, + { + "epoch": 2.794172581247665, + "grad_norm": 0.4473940432071686, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3740 + }, + { + "epoch": 2.801643630930146, + "grad_norm": 0.4626813232898712, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3750 + }, + { + "epoch": 2.809114680612626, + "grad_norm": 0.4339792728424072, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3760 + }, + { + "epoch": 2.8165857302951065, + "grad_norm": 0.5250858068466187, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 3770 + }, + { + "epoch": 2.824056779977587, + "grad_norm": 0.4537523090839386, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3780 + }, + { + "epoch": 2.831527829660067, + "grad_norm": 0.5646113157272339, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3790 + }, + { + "epoch": 2.8389988793425474, + "grad_norm": 0.44243332743644714, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3800 + }, + { + "epoch": 2.846469929025028, + "grad_norm": 0.4585791826248169, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3810 + }, + { + "epoch": 2.8539409787075085, + "grad_norm": 0.489702045917511, + "learning_rate": 0.0002, + "loss": 1.6854, + "step": 3820 + }, + { + "epoch": 2.8614120283899886, + "grad_norm": 0.502470850944519, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 3830 + }, + { + "epoch": 2.8688830780724692, + "grad_norm": 0.4395960867404938, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3840 + }, + { + "epoch": 2.87635412775495, + "grad_norm": 0.4348670244216919, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3850 + }, + { + "epoch": 2.88382517743743, + "grad_norm": 0.48852720856666565, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3860 + }, + { + "epoch": 2.89129622711991, + "grad_norm": 0.45317450165748596, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3870 + }, + { + "epoch": 2.8987672768023907, + "grad_norm": 0.4732758700847626, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3880 + }, + { + "epoch": 2.9062383264848712, + "grad_norm": 0.45238012075424194, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3890 + }, + { + "epoch": 2.9137093761673514, + "grad_norm": 0.48838064074516296, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3900 + }, + { + "epoch": 2.921180425849832, + "grad_norm": 0.43496349453926086, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 3910 + }, + { + "epoch": 2.9286514755323125, + "grad_norm": 0.47963935136795044, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 3920 + }, + { + "epoch": 2.9361225252147927, + "grad_norm": 0.4544987976551056, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 3930 + }, + { + "epoch": 2.943593574897273, + "grad_norm": 0.4622892141342163, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3940 + }, + { + "epoch": 2.9510646245797534, + "grad_norm": 0.47026222944259644, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 3950 + }, + { + "epoch": 2.958535674262234, + "grad_norm": 0.4549552798271179, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 3960 + }, + { + "epoch": 2.966006723944714, + "grad_norm": 0.46647515892982483, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3970 + }, + { + "epoch": 2.9734777736271947, + "grad_norm": 0.45095112919807434, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3980 + }, + { + "epoch": 2.9809488233096753, + "grad_norm": 0.4690017104148865, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 3990 + }, + { + "epoch": 2.9884198729921554, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 4000 + }, + { + "epoch": 2.9958909226746355, + "grad_norm": 0.4743294417858124, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4010 + }, + { + "epoch": 2.999626447515876, + "eval_loss": 1.8252571821212769, + "eval_runtime": 38.7853, + "eval_samples_per_second": 13.278, + "eval_steps_per_second": 1.676, + "step": 4015 + }, + { + "epoch": 3.003361972357116, + "grad_norm": 0.4919724464416504, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 4020 + }, + { + "epoch": 3.0108330220395967, + "grad_norm": 0.4747185707092285, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4030 + }, + { + "epoch": 3.018304071722077, + "grad_norm": 0.4797595143318176, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 4040 + }, + { + "epoch": 3.0257751214045574, + "grad_norm": 0.5450999140739441, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 4050 + }, + { + "epoch": 3.0332461710870375, + "grad_norm": 0.49058812856674194, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4060 + }, + { + "epoch": 3.040717220769518, + "grad_norm": 0.5219563841819763, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4070 + }, + { + "epoch": 3.0481882704519987, + "grad_norm": 0.515628457069397, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 4080 + }, + { + "epoch": 3.055659320134479, + "grad_norm": 0.6145984530448914, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 4090 + }, + { + "epoch": 3.0631303698169594, + "grad_norm": 0.6067144274711609, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 4100 + }, + { + "epoch": 3.0706014194994395, + "grad_norm": 0.5773133039474487, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4110 + }, + { + "epoch": 3.07807246918192, + "grad_norm": 0.6894241571426392, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 4120 + }, + { + "epoch": 3.0855435188644003, + "grad_norm": 0.6422514915466309, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4130 + }, + { + "epoch": 3.093014568546881, + "grad_norm": 0.6119855046272278, + "learning_rate": 0.0002, + "loss": 1.4724, + "step": 4140 + }, + { + "epoch": 3.1004856182293614, + "grad_norm": 0.5847280025482178, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 4150 + }, + { + "epoch": 3.1079566679118416, + "grad_norm": 0.5401515960693359, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4160 + }, + { + "epoch": 3.115427717594322, + "grad_norm": 0.6501587629318237, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 4170 + }, + { + "epoch": 3.1228987672768023, + "grad_norm": 0.5988039374351501, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 4180 + }, + { + "epoch": 3.130369816959283, + "grad_norm": 0.4982665181159973, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 4190 + }, + { + "epoch": 3.137840866641763, + "grad_norm": 0.5548039078712463, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 4200 + }, + { + "epoch": 3.1453119163242436, + "grad_norm": 0.5920777320861816, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 4210 + }, + { + "epoch": 3.152782966006724, + "grad_norm": 0.6965190172195435, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 4220 + }, + { + "epoch": 3.1602540156892043, + "grad_norm": 0.5196244716644287, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4230 + }, + { + "epoch": 3.167725065371685, + "grad_norm": 0.6942682266235352, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4240 + }, + { + "epoch": 3.175196115054165, + "grad_norm": 0.5765156149864197, + "learning_rate": 0.0002, + "loss": 1.5407, + "step": 4250 + }, + { + "epoch": 3.1826671647366456, + "grad_norm": 0.5801976919174194, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 4260 + }, + { + "epoch": 3.1901382144191257, + "grad_norm": 0.6260752081871033, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4270 + }, + { + "epoch": 3.1976092641016063, + "grad_norm": 0.6610770225524902, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 4280 + }, + { + "epoch": 3.205080313784087, + "grad_norm": 0.5762143135070801, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 4290 + }, + { + "epoch": 3.212551363466567, + "grad_norm": 0.5926990509033203, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 4300 + }, + { + "epoch": 3.2200224131490476, + "grad_norm": 0.7373854517936707, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 4310 + }, + { + "epoch": 3.2274934628315277, + "grad_norm": 0.5963311195373535, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 4320 + }, + { + "epoch": 3.2349645125140083, + "grad_norm": 0.5754616856575012, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 4330 + }, + { + "epoch": 3.2424355621964884, + "grad_norm": 0.6116095781326294, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 4340 + }, + { + "epoch": 3.249906611878969, + "grad_norm": 0.6001536846160889, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 4350 + }, + { + "epoch": 3.257377661561449, + "grad_norm": 0.5270227789878845, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 4360 + }, + { + "epoch": 3.2648487112439297, + "grad_norm": 0.6666602492332458, + "learning_rate": 0.0002, + "loss": 1.5235, + "step": 4370 + }, + { + "epoch": 3.2723197609264103, + "grad_norm": 0.520310640335083, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4380 + }, + { + "epoch": 3.2797908106088904, + "grad_norm": 0.5165975093841553, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 4390 + }, + { + "epoch": 3.287261860291371, + "grad_norm": 0.6080228686332703, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4400 + }, + { + "epoch": 3.294732909973851, + "grad_norm": 0.670122504234314, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 4410 + }, + { + "epoch": 3.3022039596563317, + "grad_norm": 0.6019457578659058, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 4420 + }, + { + "epoch": 3.309675009338812, + "grad_norm": 0.5519300103187561, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 4430 + }, + { + "epoch": 3.3171460590212924, + "grad_norm": 0.5958521962165833, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 4440 + }, + { + "epoch": 3.324617108703773, + "grad_norm": 0.5552705526351929, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4450 + }, + { + "epoch": 3.332088158386253, + "grad_norm": 0.6583784818649292, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4460 + }, + { + "epoch": 3.3395592080687337, + "grad_norm": 0.5815939903259277, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4470 + }, + { + "epoch": 3.347030257751214, + "grad_norm": 1.3342205286026, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4480 + }, + { + "epoch": 3.3545013074336945, + "grad_norm": 0.6341500878334045, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 4490 + }, + { + "epoch": 3.3619723571161746, + "grad_norm": 0.6384079456329346, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 4500 + }, + { + "epoch": 3.369443406798655, + "grad_norm": 0.6098346710205078, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 4510 + }, + { + "epoch": 3.3769144564811358, + "grad_norm": 0.5958296656608582, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4520 + }, + { + "epoch": 3.384385506163616, + "grad_norm": 0.6157881617546082, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 4530 + }, + { + "epoch": 3.3918565558460965, + "grad_norm": 0.5671007037162781, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 4540 + }, + { + "epoch": 3.3993276055285766, + "grad_norm": 0.6203294992446899, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 4550 + }, + { + "epoch": 3.406798655211057, + "grad_norm": 0.6743317246437073, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 4560 + }, + { + "epoch": 3.4142697048935373, + "grad_norm": 0.731765627861023, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4570 + }, + { + "epoch": 3.421740754576018, + "grad_norm": 0.6285187602043152, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 4580 + }, + { + "epoch": 3.4292118042584985, + "grad_norm": 0.612680196762085, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 4590 + }, + { + "epoch": 3.4366828539409786, + "grad_norm": 0.6413681507110596, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 4600 + }, + { + "epoch": 3.444153903623459, + "grad_norm": 0.6240990161895752, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4610 + }, + { + "epoch": 3.4516249533059393, + "grad_norm": 0.5095735192298889, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4620 + }, + { + "epoch": 3.45909600298842, + "grad_norm": 0.5699611902236938, + "learning_rate": 0.0002, + "loss": 1.4906, + "step": 4630 + }, + { + "epoch": 3.4665670526709, + "grad_norm": 0.7289775609970093, + "learning_rate": 0.0002, + "loss": 1.5176, + "step": 4640 + }, + { + "epoch": 3.4740381023533806, + "grad_norm": 0.6211609840393066, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 4650 + }, + { + "epoch": 3.481509152035861, + "grad_norm": 0.5714802145957947, + "learning_rate": 0.0002, + "loss": 1.533, + "step": 4660 + }, + { + "epoch": 3.4889802017183413, + "grad_norm": 0.6287049651145935, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 4670 + }, + { + "epoch": 3.496451251400822, + "grad_norm": 0.5480595827102661, + "learning_rate": 0.0002, + "loss": 1.4212, + "step": 4680 + }, + { + "epoch": 3.503922301083302, + "grad_norm": 0.5683253407478333, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4690 + }, + { + "epoch": 3.5113933507657826, + "grad_norm": 0.601140558719635, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4700 + }, + { + "epoch": 3.5188644004482628, + "grad_norm": 0.5344498157501221, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 4710 + }, + { + "epoch": 3.5263354501307433, + "grad_norm": 0.5739690661430359, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4720 + }, + { + "epoch": 3.533806499813224, + "grad_norm": 0.5640085935592651, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 4730 + }, + { + "epoch": 3.541277549495704, + "grad_norm": 0.5967805981636047, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 4740 + }, + { + "epoch": 3.5487485991781846, + "grad_norm": 0.6138835549354553, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4750 + }, + { + "epoch": 3.5562196488606648, + "grad_norm": 0.6779900193214417, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 4760 + }, + { + "epoch": 3.5636906985431454, + "grad_norm": 0.6122010350227356, + "learning_rate": 0.0002, + "loss": 1.4917, + "step": 4770 + }, + { + "epoch": 3.5711617482256255, + "grad_norm": 0.5685241222381592, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4780 + }, + { + "epoch": 3.578632797908106, + "grad_norm": 0.604583203792572, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 4790 + }, + { + "epoch": 3.5861038475905866, + "grad_norm": 0.651165246963501, + "learning_rate": 0.0002, + "loss": 1.4514, + "step": 4800 + }, + { + "epoch": 3.593574897273067, + "grad_norm": 0.6398511528968811, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 4810 + }, + { + "epoch": 3.6010459469555474, + "grad_norm": 0.6444641351699829, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4820 + }, + { + "epoch": 3.6085169966380275, + "grad_norm": 0.6018481850624084, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 4830 + }, + { + "epoch": 3.615988046320508, + "grad_norm": 0.6025291085243225, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 4840 + }, + { + "epoch": 3.623459096002988, + "grad_norm": 0.6810156106948853, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4850 + }, + { + "epoch": 3.630930145685469, + "grad_norm": 0.6408044695854187, + "learning_rate": 0.0002, + "loss": 1.5299, + "step": 4860 + }, + { + "epoch": 3.6384011953679494, + "grad_norm": 0.5608272552490234, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4870 + }, + { + "epoch": 3.6458722450504295, + "grad_norm": 0.6136814951896667, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 4880 + }, + { + "epoch": 3.65334329473291, + "grad_norm": 0.5927900075912476, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4890 + }, + { + "epoch": 3.66081434441539, + "grad_norm": 0.5336901545524597, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 4900 + }, + { + "epoch": 3.668285394097871, + "grad_norm": 0.7823320627212524, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 4910 + }, + { + "epoch": 3.675756443780351, + "grad_norm": 0.6703504323959351, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 4920 + }, + { + "epoch": 3.6832274934628315, + "grad_norm": 0.6061160564422607, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 4930 + }, + { + "epoch": 3.690698543145312, + "grad_norm": 0.6237227916717529, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4940 + }, + { + "epoch": 3.6981695928277922, + "grad_norm": 0.5985278487205505, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 4950 + }, + { + "epoch": 3.705640642510273, + "grad_norm": 0.6483839750289917, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 4960 + }, + { + "epoch": 3.713111692192753, + "grad_norm": 0.5788805484771729, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 4970 + }, + { + "epoch": 3.7205827418752335, + "grad_norm": 0.5609974265098572, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 4980 + }, + { + "epoch": 3.7280537915577137, + "grad_norm": 0.5681300759315491, + "learning_rate": 0.0002, + "loss": 1.4759, + "step": 4990 + }, + { + "epoch": 3.7355248412401942, + "grad_norm": 0.5860186219215393, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 5000 + }, + { + "epoch": 3.742995890922675, + "grad_norm": 0.5718157291412354, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 5010 + }, + { + "epoch": 3.750466940605155, + "grad_norm": 0.6173721551895142, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 5020 + }, + { + "epoch": 3.7579379902876355, + "grad_norm": 0.629152238368988, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 5030 + }, + { + "epoch": 3.7654090399701157, + "grad_norm": 0.5666284561157227, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 5040 + }, + { + "epoch": 3.7728800896525962, + "grad_norm": 0.6053005456924438, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5050 + }, + { + "epoch": 3.7803511393350764, + "grad_norm": 0.5870583057403564, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 5060 + }, + { + "epoch": 3.787822189017557, + "grad_norm": 0.5422009229660034, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5070 + }, + { + "epoch": 3.7952932387000375, + "grad_norm": 0.5396918058395386, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 5080 + }, + { + "epoch": 3.8027642883825177, + "grad_norm": 0.5544713139533997, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 5090 + }, + { + "epoch": 3.8102353380649983, + "grad_norm": 0.5983749628067017, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5100 + }, + { + "epoch": 3.8177063877474784, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 5110 + }, + { + "epoch": 3.825177437429959, + "grad_norm": 0.5436882376670837, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 5120 + }, + { + "epoch": 3.832648487112439, + "grad_norm": 0.5453617572784424, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 5130 + }, + { + "epoch": 3.8401195367949197, + "grad_norm": 0.6269069314002991, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 5140 + }, + { + "epoch": 3.8475905864774003, + "grad_norm": 0.6189185380935669, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 5150 + }, + { + "epoch": 3.8550616361598804, + "grad_norm": 0.6653388142585754, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 5160 + }, + { + "epoch": 3.862532685842361, + "grad_norm": 0.5771768689155579, + "learning_rate": 0.0002, + "loss": 1.5075, + "step": 5170 + }, + { + "epoch": 3.870003735524841, + "grad_norm": 0.6052790880203247, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5180 + }, + { + "epoch": 3.8774747852073217, + "grad_norm": 0.6572316884994507, + "learning_rate": 0.0002, + "loss": 1.4987, + "step": 5190 + }, + { + "epoch": 3.884945834889802, + "grad_norm": 0.670576810836792, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 5200 + }, + { + "epoch": 3.8924168845722824, + "grad_norm": 0.5728798508644104, + "learning_rate": 0.0002, + "loss": 1.4777, + "step": 5210 + }, + { + "epoch": 3.899887934254763, + "grad_norm": 0.6340774297714233, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 5220 + }, + { + "epoch": 3.907358983937243, + "grad_norm": 0.5981315970420837, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 5230 + }, + { + "epoch": 3.9148300336197237, + "grad_norm": 0.6212025880813599, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 5240 + }, + { + "epoch": 3.922301083302204, + "grad_norm": 0.6202296018600464, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5250 + }, + { + "epoch": 3.9297721329846844, + "grad_norm": 0.6159142255783081, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 5260 + }, + { + "epoch": 3.9372431826671646, + "grad_norm": 0.6519438624382019, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 5270 + }, + { + "epoch": 3.944714232349645, + "grad_norm": 0.539813756942749, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5280 + }, + { + "epoch": 3.9521852820321257, + "grad_norm": 0.6443665027618408, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 5290 + }, + { + "epoch": 3.959656331714606, + "grad_norm": 0.6635757684707642, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 5300 + }, + { + "epoch": 3.9671273813970864, + "grad_norm": 0.589363157749176, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 5310 + }, + { + "epoch": 3.9745984310795666, + "grad_norm": 0.5788735747337341, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 5320 + }, + { + "epoch": 3.982069480762047, + "grad_norm": 0.5976864695549011, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 5330 + }, + { + "epoch": 3.9895405304445273, + "grad_norm": 0.6624067425727844, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 5340 + }, + { + "epoch": 3.997011580127008, + "grad_norm": 0.6738956570625305, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 5350 + }, + { + "epoch": 4.0, + "eval_loss": 1.868006944656372, + "eval_runtime": 38.5153, + "eval_samples_per_second": 13.371, + "eval_steps_per_second": 1.688, + "step": 5354 + }, + { + "epoch": 4.004482629809488, + "grad_norm": 0.6023468971252441, + "learning_rate": 0.0002, + "loss": 1.4535, + "step": 5360 + }, + { + "epoch": 4.011953679491969, + "grad_norm": 0.8589285612106323, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 5370 + }, + { + "epoch": 4.019424729174449, + "grad_norm": 0.7477491497993469, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5380 + }, + { + "epoch": 4.02689577885693, + "grad_norm": 0.7601922154426575, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 5390 + }, + { + "epoch": 4.03436682853941, + "grad_norm": 0.8115614056587219, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 5400 + }, + { + "epoch": 4.04183787822189, + "grad_norm": 0.669925332069397, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5410 + }, + { + "epoch": 4.04930892790437, + "grad_norm": 0.8091904520988464, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 5420 + }, + { + "epoch": 4.056779977586851, + "grad_norm": 0.709405779838562, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 5430 + }, + { + "epoch": 4.064251027269331, + "grad_norm": 1.0006179809570312, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 5440 + }, + { + "epoch": 4.071722076951811, + "grad_norm": 0.7017965912818909, + "learning_rate": 0.0002, + "loss": 1.3491, + "step": 5450 + }, + { + "epoch": 4.0791931266342925, + "grad_norm": 0.8991572260856628, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 5460 + }, + { + "epoch": 4.086664176316773, + "grad_norm": 0.9064797759056091, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5470 + }, + { + "epoch": 4.094135225999253, + "grad_norm": 0.7981749176979065, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5480 + }, + { + "epoch": 4.101606275681733, + "grad_norm": 0.7280883193016052, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 5490 + }, + { + "epoch": 4.109077325364214, + "grad_norm": 0.7419600486755371, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 5500 + }, + { + "epoch": 4.116548375046694, + "grad_norm": 0.8019949197769165, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 5510 + }, + { + "epoch": 4.124019424729174, + "grad_norm": 0.7501229047775269, + "learning_rate": 0.0002, + "loss": 1.3133, + "step": 5520 + }, + { + "epoch": 4.131490474411655, + "grad_norm": 0.8166249990463257, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 5530 + }, + { + "epoch": 4.138961524094135, + "grad_norm": 0.9728496074676514, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5540 + }, + { + "epoch": 4.1464325737766154, + "grad_norm": 0.7590922117233276, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 5550 + }, + { + "epoch": 4.153903623459096, + "grad_norm": 0.7759010791778564, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 5560 + }, + { + "epoch": 4.161374673141577, + "grad_norm": 0.9057986736297607, + "learning_rate": 0.0002, + "loss": 1.3635, + "step": 5570 + }, + { + "epoch": 4.168845722824057, + "grad_norm": 0.8853937983512878, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 5580 + }, + { + "epoch": 4.176316772506537, + "grad_norm": 0.7070684432983398, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5590 + }, + { + "epoch": 4.183787822189018, + "grad_norm": 0.7649410963058472, + "learning_rate": 0.0002, + "loss": 1.3218, + "step": 5600 + }, + { + "epoch": 4.191258871871498, + "grad_norm": 1.2048029899597168, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5610 + }, + { + "epoch": 4.198729921553978, + "grad_norm": 0.7986605763435364, + "learning_rate": 0.0002, + "loss": 1.3629, + "step": 5620 + }, + { + "epoch": 4.206200971236458, + "grad_norm": 0.8151885867118835, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 5630 + }, + { + "epoch": 4.213672020918939, + "grad_norm": 0.7719064354896545, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5640 + }, + { + "epoch": 4.2211430706014195, + "grad_norm": 0.8422448039054871, + "learning_rate": 0.0002, + "loss": 1.3852, + "step": 5650 + }, + { + "epoch": 4.2286141202839, + "grad_norm": 0.7017164826393127, + "learning_rate": 0.0002, + "loss": 1.3321, + "step": 5660 + }, + { + "epoch": 4.236085169966381, + "grad_norm": 0.8559677600860596, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 5670 + }, + { + "epoch": 4.243556219648861, + "grad_norm": 0.8216157555580139, + "learning_rate": 0.0002, + "loss": 1.3701, + "step": 5680 + }, + { + "epoch": 4.251027269331341, + "grad_norm": 0.7681755423545837, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5690 + }, + { + "epoch": 4.258498319013821, + "grad_norm": 0.811665952205658, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 5700 + }, + { + "epoch": 4.265969368696302, + "grad_norm": 0.7242204546928406, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 5710 + }, + { + "epoch": 4.273440418378782, + "grad_norm": 0.7570181488990784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 5720 + }, + { + "epoch": 4.280911468061262, + "grad_norm": 0.8951969146728516, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 5730 + }, + { + "epoch": 4.288382517743743, + "grad_norm": 0.7222902178764343, + "learning_rate": 0.0002, + "loss": 1.3895, + "step": 5740 + }, + { + "epoch": 4.2958535674262235, + "grad_norm": 0.8508469462394714, + "learning_rate": 0.0002, + "loss": 1.4155, + "step": 5750 + }, + { + "epoch": 4.303324617108704, + "grad_norm": 0.7215430736541748, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 5760 + }, + { + "epoch": 4.310795666791184, + "grad_norm": 0.8774884939193726, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5770 + }, + { + "epoch": 4.318266716473665, + "grad_norm": 0.8354552984237671, + "learning_rate": 0.0002, + "loss": 1.427, + "step": 5780 + }, + { + "epoch": 4.325737766156145, + "grad_norm": 0.6938814520835876, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 5790 + }, + { + "epoch": 4.333208815838625, + "grad_norm": 0.78675377368927, + "learning_rate": 0.0002, + "loss": 1.3589, + "step": 5800 + }, + { + "epoch": 4.340679865521106, + "grad_norm": 0.7147697806358337, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 5810 + }, + { + "epoch": 4.348150915203586, + "grad_norm": 0.7693623304367065, + "learning_rate": 0.0002, + "loss": 1.3597, + "step": 5820 + }, + { + "epoch": 4.355621964886066, + "grad_norm": 0.856517493724823, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 5830 + }, + { + "epoch": 4.3630930145685465, + "grad_norm": 0.7200973033905029, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 5840 + }, + { + "epoch": 4.3705640642510275, + "grad_norm": 0.743281364440918, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 5850 + }, + { + "epoch": 4.378035113933508, + "grad_norm": 0.7627727389335632, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5860 + }, + { + "epoch": 4.385506163615988, + "grad_norm": 0.7238836884498596, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 5870 + }, + { + "epoch": 4.392977213298469, + "grad_norm": 0.7253410816192627, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5880 + }, + { + "epoch": 4.400448262980949, + "grad_norm": 0.8232238292694092, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 5890 + }, + { + "epoch": 4.407919312663429, + "grad_norm": 0.8778504729270935, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5900 + }, + { + "epoch": 4.415390362345909, + "grad_norm": 0.7639474868774414, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5910 + }, + { + "epoch": 4.42286141202839, + "grad_norm": 0.7666519284248352, + "learning_rate": 0.0002, + "loss": 1.3862, + "step": 5920 + }, + { + "epoch": 4.43033246171087, + "grad_norm": 0.867132842540741, + "learning_rate": 0.0002, + "loss": 1.4168, + "step": 5930 + }, + { + "epoch": 4.4378035113933505, + "grad_norm": 0.7571166753768921, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 5940 + }, + { + "epoch": 4.4452745610758315, + "grad_norm": 0.7911370992660522, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 5950 + }, + { + "epoch": 4.452745610758312, + "grad_norm": 0.8844250440597534, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 5960 + }, + { + "epoch": 4.460216660440792, + "grad_norm": 0.7336231470108032, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 5970 + }, + { + "epoch": 4.467687710123272, + "grad_norm": 0.8162738084793091, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 5980 + }, + { + "epoch": 4.475158759805753, + "grad_norm": 0.7413017153739929, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 5990 + }, + { + "epoch": 4.482629809488233, + "grad_norm": 0.7215432524681091, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 6000 + }, + { + "epoch": 4.490100859170713, + "grad_norm": 0.8943389058113098, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 6010 + }, + { + "epoch": 4.497571908853194, + "grad_norm": 0.7850823998451233, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 6020 + }, + { + "epoch": 4.505042958535674, + "grad_norm": 0.8117504119873047, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 6030 + }, + { + "epoch": 4.5125140082181545, + "grad_norm": 0.8381605744361877, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 6040 + }, + { + "epoch": 4.519985057900635, + "grad_norm": 0.7964059710502625, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 6050 + }, + { + "epoch": 4.527456107583116, + "grad_norm": 0.7935128211975098, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 6060 + }, + { + "epoch": 4.534927157265596, + "grad_norm": 0.8725124597549438, + "learning_rate": 0.0002, + "loss": 1.3994, + "step": 6070 + }, + { + "epoch": 4.542398206948076, + "grad_norm": 0.880325198173523, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6080 + }, + { + "epoch": 4.549869256630557, + "grad_norm": 0.7220637202262878, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 6090 + }, + { + "epoch": 4.557340306313037, + "grad_norm": 0.6908547878265381, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6100 + }, + { + "epoch": 4.564811355995517, + "grad_norm": 0.797931969165802, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 6110 + }, + { + "epoch": 4.572282405677997, + "grad_norm": 0.7056134343147278, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 6120 + }, + { + "epoch": 4.579753455360478, + "grad_norm": 0.7850478887557983, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 6130 + }, + { + "epoch": 4.5872245050429585, + "grad_norm": 0.8112621307373047, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6140 + }, + { + "epoch": 4.594695554725439, + "grad_norm": 0.7040849328041077, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 6150 + }, + { + "epoch": 4.60216660440792, + "grad_norm": 0.7214553952217102, + "learning_rate": 0.0002, + "loss": 1.3526, + "step": 6160 + }, + { + "epoch": 4.6096376540904, + "grad_norm": 0.8616511821746826, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6170 + }, + { + "epoch": 4.61710870377288, + "grad_norm": 0.8374658226966858, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 6180 + }, + { + "epoch": 4.62457975345536, + "grad_norm": 0.6761606931686401, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 6190 + }, + { + "epoch": 4.632050803137841, + "grad_norm": 0.768028199672699, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 6200 + }, + { + "epoch": 4.639521852820321, + "grad_norm": 0.9372717142105103, + "learning_rate": 0.0002, + "loss": 1.3772, + "step": 6210 + }, + { + "epoch": 4.646992902502801, + "grad_norm": 0.7906546592712402, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 6220 + }, + { + "epoch": 4.654463952185282, + "grad_norm": 0.7376723289489746, + "learning_rate": 0.0002, + "loss": 1.3962, + "step": 6230 + }, + { + "epoch": 4.6619350018677626, + "grad_norm": 0.8972630500793457, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 6240 + }, + { + "epoch": 4.669406051550243, + "grad_norm": 0.8261756300926208, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 6250 + }, + { + "epoch": 4.676877101232723, + "grad_norm": 0.7512393593788147, + "learning_rate": 0.0002, + "loss": 1.3267, + "step": 6260 + }, + { + "epoch": 4.684348150915204, + "grad_norm": 0.7132362127304077, + "learning_rate": 0.0002, + "loss": 1.4278, + "step": 6270 + }, + { + "epoch": 4.691819200597684, + "grad_norm": 0.7690575122833252, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6280 + }, + { + "epoch": 4.699290250280164, + "grad_norm": 0.9886258840560913, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 6290 + }, + { + "epoch": 4.706761299962645, + "grad_norm": 0.9502435922622681, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6300 + }, + { + "epoch": 4.714232349645125, + "grad_norm": 0.702255129814148, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6310 + }, + { + "epoch": 4.721703399327605, + "grad_norm": 0.7713103890419006, + "learning_rate": 0.0002, + "loss": 1.4447, + "step": 6320 + }, + { + "epoch": 4.7291744490100855, + "grad_norm": 0.7778580784797668, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 6330 + }, + { + "epoch": 4.736645498692567, + "grad_norm": 0.7275111079216003, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 6340 + }, + { + "epoch": 4.744116548375047, + "grad_norm": 0.7728744149208069, + "learning_rate": 0.0002, + "loss": 1.4429, + "step": 6350 + }, + { + "epoch": 4.751587598057527, + "grad_norm": 0.9724260568618774, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 6360 + }, + { + "epoch": 4.759058647740007, + "grad_norm": 0.7505622506141663, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 6370 + }, + { + "epoch": 4.766529697422488, + "grad_norm": 0.7994682788848877, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 6380 + }, + { + "epoch": 4.774000747104968, + "grad_norm": 0.8432038426399231, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 6390 + }, + { + "epoch": 4.781471796787448, + "grad_norm": 0.7436022758483887, + "learning_rate": 0.0002, + "loss": 1.4606, + "step": 6400 + }, + { + "epoch": 4.788942846469929, + "grad_norm": 0.7709194421768188, + "learning_rate": 0.0002, + "loss": 1.3461, + "step": 6410 + }, + { + "epoch": 4.796413896152409, + "grad_norm": 0.8798436522483826, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6420 + }, + { + "epoch": 4.80388494583489, + "grad_norm": 0.790189266204834, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 6430 + }, + { + "epoch": 4.811355995517371, + "grad_norm": 0.6824303865432739, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 6440 + }, + { + "epoch": 4.818827045199851, + "grad_norm": 0.7501044869422913, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 6450 + }, + { + "epoch": 4.826298094882331, + "grad_norm": 0.8840398192405701, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 6460 + }, + { + "epoch": 4.833769144564811, + "grad_norm": 0.7812688946723938, + "learning_rate": 0.0002, + "loss": 1.4412, + "step": 6470 + }, + { + "epoch": 4.841240194247292, + "grad_norm": 0.7429926991462708, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6480 + }, + { + "epoch": 4.848711243929772, + "grad_norm": 0.7778021693229675, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 6490 + }, + { + "epoch": 4.856182293612252, + "grad_norm": 0.8270702362060547, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 6500 + }, + { + "epoch": 4.863653343294732, + "grad_norm": 0.6960513591766357, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6510 + }, + { + "epoch": 4.8711243929772134, + "grad_norm": 0.7728942632675171, + "learning_rate": 0.0002, + "loss": 1.376, + "step": 6520 + }, + { + "epoch": 4.878595442659694, + "grad_norm": 0.7377303838729858, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 6530 + }, + { + "epoch": 4.886066492342174, + "grad_norm": 0.7257253527641296, + "learning_rate": 0.0002, + "loss": 1.3846, + "step": 6540 + }, + { + "epoch": 4.893537542024655, + "grad_norm": 0.7875821590423584, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 6550 + }, + { + "epoch": 4.901008591707135, + "grad_norm": 0.8346304297447205, + "learning_rate": 0.0002, + "loss": 1.357, + "step": 6560 + }, + { + "epoch": 4.908479641389615, + "grad_norm": 0.7710739374160767, + "learning_rate": 0.0002, + "loss": 1.4522, + "step": 6570 + }, + { + "epoch": 4.915950691072096, + "grad_norm": 0.7015138268470764, + "learning_rate": 0.0002, + "loss": 1.4465, + "step": 6580 + }, + { + "epoch": 4.923421740754576, + "grad_norm": 0.8707432150840759, + "learning_rate": 0.0002, + "loss": 1.435, + "step": 6590 + }, + { + "epoch": 4.930892790437056, + "grad_norm": 0.786601185798645, + "learning_rate": 0.0002, + "loss": 1.2968, + "step": 6600 + }, + { + "epoch": 4.938363840119536, + "grad_norm": 0.978519082069397, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 6610 + }, + { + "epoch": 4.9458348898020175, + "grad_norm": 0.8102927207946777, + "learning_rate": 0.0002, + "loss": 1.3997, + "step": 6620 + }, + { + "epoch": 4.953305939484498, + "grad_norm": 0.7628704309463501, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 6630 + }, + { + "epoch": 4.960776989166978, + "grad_norm": 0.8053455352783203, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 6640 + }, + { + "epoch": 4.968248038849458, + "grad_norm": 0.8680412173271179, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 6650 + }, + { + "epoch": 4.975719088531939, + "grad_norm": 0.7415758371353149, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 6660 + }, + { + "epoch": 4.983190138214419, + "grad_norm": 0.7730312347412109, + "learning_rate": 0.0002, + "loss": 1.3793, + "step": 6670 + }, + { + "epoch": 4.990661187896899, + "grad_norm": 0.7924041152000427, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 6680 + }, + { + "epoch": 4.99813223757938, + "grad_norm": 0.8677893877029419, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 6690 + }, + { + "epoch": 4.999626447515876, + "eval_loss": 1.9444633722305298, + "eval_runtime": 39.3488, + "eval_samples_per_second": 13.088, + "eval_steps_per_second": 1.652, + "step": 6692 + }, + { + "epoch": 5.00560328726186, + "grad_norm": 0.7102245092391968, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 6700 + }, + { + "epoch": 5.0130743369443405, + "grad_norm": 1.0425463914871216, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 6710 + }, + { + "epoch": 5.0205453866268215, + "grad_norm": 0.9320756793022156, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 6720 + }, + { + "epoch": 5.028016436309302, + "grad_norm": 0.8797217607498169, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 6730 + }, + { + "epoch": 5.035487485991782, + "grad_norm": 2.135707139968872, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 6740 + }, + { + "epoch": 5.042958535674262, + "grad_norm": 0.8747734427452087, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 6750 + }, + { + "epoch": 5.050429585356743, + "grad_norm": 0.9981076717376709, + "learning_rate": 0.0002, + "loss": 1.1675, + "step": 6760 + }, + { + "epoch": 5.057900635039223, + "grad_norm": 0.985078752040863, + "learning_rate": 0.0002, + "loss": 1.1976, + "step": 6770 + }, + { + "epoch": 5.065371684721703, + "grad_norm": 1.0974019765853882, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 6780 + }, + { + "epoch": 5.072842734404184, + "grad_norm": 0.9823219180107117, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6790 + }, + { + "epoch": 5.080313784086664, + "grad_norm": 1.122605562210083, + "learning_rate": 0.0002, + "loss": 1.2586, + "step": 6800 + }, + { + "epoch": 5.0877848337691445, + "grad_norm": 0.8556802272796631, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6810 + }, + { + "epoch": 5.095255883451625, + "grad_norm": 1.1699262857437134, + "learning_rate": 0.0002, + "loss": 1.1908, + "step": 6820 + }, + { + "epoch": 5.102726933134106, + "grad_norm": 1.0440590381622314, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6830 + }, + { + "epoch": 5.110197982816586, + "grad_norm": 1.0445300340652466, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 6840 + }, + { + "epoch": 5.117669032499066, + "grad_norm": 0.8289563059806824, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 6850 + }, + { + "epoch": 5.125140082181547, + "grad_norm": 1.1051193475723267, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 6860 + }, + { + "epoch": 5.132611131864027, + "grad_norm": 0.9345614910125732, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 6870 + }, + { + "epoch": 5.140082181546507, + "grad_norm": 1.1222996711730957, + "learning_rate": 0.0002, + "loss": 1.3021, + "step": 6880 + }, + { + "epoch": 5.147553231228987, + "grad_norm": 0.9405338764190674, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 6890 + }, + { + "epoch": 5.155024280911468, + "grad_norm": 1.0935171842575073, + "learning_rate": 0.0002, + "loss": 1.2367, + "step": 6900 + }, + { + "epoch": 5.1624953305939485, + "grad_norm": 1.0438612699508667, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 6910 + }, + { + "epoch": 5.169966380276429, + "grad_norm": 1.1189004182815552, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6920 + }, + { + "epoch": 5.17743742995891, + "grad_norm": 1.0533215999603271, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 6930 + }, + { + "epoch": 5.18490847964139, + "grad_norm": 0.9779648780822754, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 6940 + }, + { + "epoch": 5.19237952932387, + "grad_norm": 0.8920868635177612, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 6950 + }, + { + "epoch": 5.19985057900635, + "grad_norm": 0.8374548554420471, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 6960 + }, + { + "epoch": 5.207321628688831, + "grad_norm": 1.0490682125091553, + "learning_rate": 0.0002, + "loss": 1.2775, + "step": 6970 + }, + { + "epoch": 5.214792678371311, + "grad_norm": 0.9658287167549133, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 6980 + }, + { + "epoch": 5.222263728053791, + "grad_norm": 0.9652056097984314, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6990 + }, + { + "epoch": 5.229734777736272, + "grad_norm": 0.9141794443130493, + "learning_rate": 0.0002, + "loss": 1.3023, + "step": 7000 + }, + { + "epoch": 5.2372058274187525, + "grad_norm": 0.9831376671791077, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 7010 + }, + { + "epoch": 5.244676877101233, + "grad_norm": 1.0198718309402466, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 7020 + }, + { + "epoch": 5.252147926783713, + "grad_norm": 0.9647888541221619, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 7030 + }, + { + "epoch": 5.259618976466194, + "grad_norm": 1.3941649198532104, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 7040 + }, + { + "epoch": 5.267090026148674, + "grad_norm": 1.0305466651916504, + "learning_rate": 0.0002, + "loss": 1.2885, + "step": 7050 + }, + { + "epoch": 5.274561075831154, + "grad_norm": 0.9577859044075012, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 7060 + }, + { + "epoch": 5.282032125513634, + "grad_norm": 1.149092197418213, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 7070 + }, + { + "epoch": 5.289503175196115, + "grad_norm": 1.2582733631134033, + "learning_rate": 0.0002, + "loss": 1.2986, + "step": 7080 + }, + { + "epoch": 5.296974224878595, + "grad_norm": 1.1777442693710327, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 7090 + }, + { + "epoch": 5.3044452745610755, + "grad_norm": 1.0076404809951782, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 7100 + }, + { + "epoch": 5.3119163242435565, + "grad_norm": 0.9037365913391113, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 7110 + }, + { + "epoch": 5.319387373926037, + "grad_norm": 0.9428724646568298, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 7120 + }, + { + "epoch": 5.326858423608517, + "grad_norm": 0.9935154318809509, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 7130 + }, + { + "epoch": 5.334329473290998, + "grad_norm": 1.087500810623169, + "learning_rate": 0.0002, + "loss": 1.2833, + "step": 7140 + }, + { + "epoch": 5.341800522973478, + "grad_norm": 0.8543072938919067, + "learning_rate": 0.0002, + "loss": 1.2304, + "step": 7150 + }, + { + "epoch": 5.349271572655958, + "grad_norm": 0.9323700070381165, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 7160 + }, + { + "epoch": 5.356742622338438, + "grad_norm": 1.0037827491760254, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 7170 + }, + { + "epoch": 5.364213672020919, + "grad_norm": 0.8746469616889954, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 7180 + }, + { + "epoch": 5.371684721703399, + "grad_norm": 0.9516328573226929, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 7190 + }, + { + "epoch": 5.3791557713858795, + "grad_norm": 0.9395177364349365, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 7200 + }, + { + "epoch": 5.38662682106836, + "grad_norm": 1.000369906425476, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 7210 + }, + { + "epoch": 5.394097870750841, + "grad_norm": 1.0845502614974976, + "learning_rate": 0.0002, + "loss": 1.2337, + "step": 7220 + }, + { + "epoch": 5.401568920433321, + "grad_norm": 0.8975145220756531, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 7230 + }, + { + "epoch": 5.409039970115801, + "grad_norm": 1.040077805519104, + "learning_rate": 0.0002, + "loss": 1.2306, + "step": 7240 + }, + { + "epoch": 5.416511019798282, + "grad_norm": 1.0729942321777344, + "learning_rate": 0.0002, + "loss": 1.2277, + "step": 7250 + }, + { + "epoch": 5.423982069480762, + "grad_norm": 0.8322232961654663, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 7260 + }, + { + "epoch": 5.431453119163242, + "grad_norm": 1.0654641389846802, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 7270 + }, + { + "epoch": 5.438924168845723, + "grad_norm": 1.0445852279663086, + "learning_rate": 0.0002, + "loss": 1.268, + "step": 7280 + }, + { + "epoch": 5.446395218528203, + "grad_norm": 1.0762956142425537, + "learning_rate": 0.0002, + "loss": 1.2743, + "step": 7290 + }, + { + "epoch": 5.4538662682106835, + "grad_norm": 0.9721953868865967, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7300 + }, + { + "epoch": 5.461337317893164, + "grad_norm": 0.9238539338111877, + "learning_rate": 0.0002, + "loss": 1.2833, + "step": 7310 + }, + { + "epoch": 5.468808367575645, + "grad_norm": 0.9912874102592468, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 7320 + }, + { + "epoch": 5.476279417258125, + "grad_norm": 1.0727077722549438, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 7330 + }, + { + "epoch": 5.483750466940605, + "grad_norm": 0.8633865118026733, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 7340 + }, + { + "epoch": 5.491221516623085, + "grad_norm": 0.9396262764930725, + "learning_rate": 0.0002, + "loss": 1.3155, + "step": 7350 + }, + { + "epoch": 5.498692566305566, + "grad_norm": 1.0253715515136719, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 7360 + }, + { + "epoch": 5.506163615988046, + "grad_norm": 1.006047010421753, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 7370 + }, + { + "epoch": 5.513634665670526, + "grad_norm": 0.9781233072280884, + "learning_rate": 0.0002, + "loss": 1.3107, + "step": 7380 + }, + { + "epoch": 5.521105715353007, + "grad_norm": 0.9945126175880432, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 7390 + }, + { + "epoch": 5.528576765035488, + "grad_norm": 0.9081175327301025, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 7400 + }, + { + "epoch": 5.536047814717968, + "grad_norm": 1.2215938568115234, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 7410 + }, + { + "epoch": 5.543518864400449, + "grad_norm": 1.0724077224731445, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 7420 + }, + { + "epoch": 5.550989914082929, + "grad_norm": 1.106955885887146, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 7430 + }, + { + "epoch": 5.558460963765409, + "grad_norm": 1.0657650232315063, + "learning_rate": 0.0002, + "loss": 1.2125, + "step": 7440 + }, + { + "epoch": 5.565932013447889, + "grad_norm": 0.9725455641746521, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 7450 + }, + { + "epoch": 5.57340306313037, + "grad_norm": 0.8604224324226379, + "learning_rate": 0.0002, + "loss": 1.3297, + "step": 7460 + }, + { + "epoch": 5.58087411281285, + "grad_norm": 0.9913371205329895, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 7470 + }, + { + "epoch": 5.58834516249533, + "grad_norm": 1.012073040008545, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 7480 + }, + { + "epoch": 5.5958162121778106, + "grad_norm": 1.1003159284591675, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 7490 + }, + { + "epoch": 5.603287261860292, + "grad_norm": 0.9104593992233276, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7500 + }, + { + "epoch": 5.610758311542772, + "grad_norm": 0.9480831623077393, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 7510 + }, + { + "epoch": 5.618229361225252, + "grad_norm": 1.0826456546783447, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 7520 + }, + { + "epoch": 5.625700410907733, + "grad_norm": 0.8286259174346924, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 7530 + }, + { + "epoch": 5.633171460590213, + "grad_norm": 0.9145061373710632, + "learning_rate": 0.0002, + "loss": 1.2918, + "step": 7540 + }, + { + "epoch": 5.640642510272693, + "grad_norm": 0.9363601803779602, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 7550 + }, + { + "epoch": 5.648113559955174, + "grad_norm": 0.9553244709968567, + "learning_rate": 0.0002, + "loss": 1.2265, + "step": 7560 + }, + { + "epoch": 5.655584609637654, + "grad_norm": 1.0343557596206665, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 7570 + }, + { + "epoch": 5.663055659320134, + "grad_norm": 0.8734238743782043, + "learning_rate": 0.0002, + "loss": 1.3171, + "step": 7580 + }, + { + "epoch": 5.670526709002615, + "grad_norm": 1.0230586528778076, + "learning_rate": 0.0002, + "loss": 1.2785, + "step": 7590 + }, + { + "epoch": 5.677997758685096, + "grad_norm": 1.0063409805297852, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 7600 + }, + { + "epoch": 5.685468808367576, + "grad_norm": 1.0104626417160034, + "learning_rate": 0.0002, + "loss": 1.2396, + "step": 7610 + }, + { + "epoch": 5.692939858050056, + "grad_norm": 0.9528168439865112, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 7620 + }, + { + "epoch": 5.700410907732536, + "grad_norm": 0.9799878597259521, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 7630 + }, + { + "epoch": 5.707881957415017, + "grad_norm": 0.969351589679718, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 7640 + }, + { + "epoch": 5.715353007097497, + "grad_norm": 1.3037652969360352, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 7650 + }, + { + "epoch": 5.722824056779977, + "grad_norm": 1.0640486478805542, + "learning_rate": 0.0002, + "loss": 1.3126, + "step": 7660 + }, + { + "epoch": 5.730295106462458, + "grad_norm": 1.0416420698165894, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 7670 + }, + { + "epoch": 5.7377661561449385, + "grad_norm": 0.8893619775772095, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 7680 + }, + { + "epoch": 5.745237205827419, + "grad_norm": 0.8512844443321228, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 7690 + }, + { + "epoch": 5.7527082555099, + "grad_norm": 0.9955748319625854, + "learning_rate": 0.0002, + "loss": 1.3328, + "step": 7700 + }, + { + "epoch": 5.76017930519238, + "grad_norm": 1.0409910678863525, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7710 + }, + { + "epoch": 5.76765035487486, + "grad_norm": 1.010097861289978, + "learning_rate": 0.0002, + "loss": 1.3518, + "step": 7720 + }, + { + "epoch": 5.77512140455734, + "grad_norm": 0.8974892497062683, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 7730 + }, + { + "epoch": 5.782592454239821, + "grad_norm": 0.972835123538971, + "learning_rate": 0.0002, + "loss": 1.2743, + "step": 7740 + }, + { + "epoch": 5.790063503922301, + "grad_norm": 0.9607440829277039, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 7750 + }, + { + "epoch": 5.797534553604781, + "grad_norm": 0.9426500797271729, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 7760 + }, + { + "epoch": 5.8050056032872615, + "grad_norm": 0.8745320439338684, + "learning_rate": 0.0002, + "loss": 1.274, + "step": 7770 + }, + { + "epoch": 5.8124766529697425, + "grad_norm": 1.0117204189300537, + "learning_rate": 0.0002, + "loss": 1.3009, + "step": 7780 + }, + { + "epoch": 5.819947702652223, + "grad_norm": 1.0387755632400513, + "learning_rate": 0.0002, + "loss": 1.3135, + "step": 7790 + }, + { + "epoch": 5.827418752334703, + "grad_norm": 1.0709784030914307, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 7800 + }, + { + "epoch": 5.834889802017184, + "grad_norm": 0.9512667655944824, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 7810 + }, + { + "epoch": 5.842360851699664, + "grad_norm": 1.021094560623169, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 7820 + }, + { + "epoch": 5.849831901382144, + "grad_norm": 1.117491364479065, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 7830 + }, + { + "epoch": 5.857302951064625, + "grad_norm": 0.9252554178237915, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 7840 + }, + { + "epoch": 5.864774000747105, + "grad_norm": 1.1416207551956177, + "learning_rate": 0.0002, + "loss": 1.2976, + "step": 7850 + }, + { + "epoch": 5.872245050429585, + "grad_norm": 1.1219907999038696, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 7860 + }, + { + "epoch": 5.8797161001120655, + "grad_norm": 0.8300467729568481, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 7870 + }, + { + "epoch": 5.8871871497945465, + "grad_norm": 1.00551438331604, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 7880 + }, + { + "epoch": 5.894658199477027, + "grad_norm": 0.8981153964996338, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 7890 + }, + { + "epoch": 5.902129249159507, + "grad_norm": 1.0247976779937744, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 7900 + }, + { + "epoch": 5.909600298841987, + "grad_norm": 1.0820319652557373, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 7910 + }, + { + "epoch": 5.917071348524468, + "grad_norm": 0.952675461769104, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 7920 + }, + { + "epoch": 5.924542398206948, + "grad_norm": 0.8666740655899048, + "learning_rate": 0.0002, + "loss": 1.307, + "step": 7930 + }, + { + "epoch": 5.932013447889428, + "grad_norm": 0.8640421032905579, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 7940 + }, + { + "epoch": 5.939484497571909, + "grad_norm": 1.2343276739120483, + "learning_rate": 0.0002, + "loss": 1.2386, + "step": 7950 + }, + { + "epoch": 5.946955547254389, + "grad_norm": 0.958046555519104, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 7960 + }, + { + "epoch": 5.9544265969368695, + "grad_norm": 1.0538510084152222, + "learning_rate": 0.0002, + "loss": 1.2352, + "step": 7970 + }, + { + "epoch": 5.9618976466193505, + "grad_norm": 1.2681571245193481, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 7980 + }, + { + "epoch": 5.969368696301831, + "grad_norm": 0.8171183466911316, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 7990 + }, + { + "epoch": 5.976839745984311, + "grad_norm": 0.9109523892402649, + "learning_rate": 0.0002, + "loss": 1.3412, + "step": 8000 + }, + { + "epoch": 5.984310795666791, + "grad_norm": 1.0040639638900757, + "learning_rate": 0.0002, + "loss": 1.3497, + "step": 8010 + }, + { + "epoch": 5.991781845349272, + "grad_norm": 0.9596554040908813, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 8020 + }, + { + "epoch": 5.999252895031752, + "grad_norm": 0.9782963991165161, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 8030 + }, + { + "epoch": 6.0, + "eval_loss": 2.0417845249176025, + "eval_runtime": 38.8465, + "eval_samples_per_second": 13.257, + "eval_steps_per_second": 1.673, + "step": 8031 + }, + { + "epoch": 6.006723944714232, + "grad_norm": 1.380823016166687, + "learning_rate": 0.0002, + "loss": 1.0886, + "step": 8040 + }, + { + "epoch": 6.014194994396712, + "grad_norm": 1.067636251449585, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 8050 + }, + { + "epoch": 6.021666044079193, + "grad_norm": 1.363402009010315, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 8060 + }, + { + "epoch": 6.0291370937616735, + "grad_norm": 0.9901054501533508, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 8070 + }, + { + "epoch": 6.036608143444154, + "grad_norm": 1.1545379161834717, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 8080 + }, + { + "epoch": 6.044079193126635, + "grad_norm": 1.2259265184402466, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 8090 + }, + { + "epoch": 6.051550242809115, + "grad_norm": 1.1237425804138184, + "learning_rate": 0.0002, + "loss": 1.1273, + "step": 8100 + }, + { + "epoch": 6.059021292491595, + "grad_norm": 1.2805622816085815, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 8110 + }, + { + "epoch": 6.066492342174075, + "grad_norm": 1.2270452976226807, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 8120 + }, + { + "epoch": 6.073963391856556, + "grad_norm": 1.1924101114273071, + "learning_rate": 0.0002, + "loss": 1.0692, + "step": 8130 + }, + { + "epoch": 6.081434441539036, + "grad_norm": 1.2543894052505493, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 8140 + }, + { + "epoch": 6.088905491221516, + "grad_norm": 1.1821149587631226, + "learning_rate": 0.0002, + "loss": 1.069, + "step": 8150 + }, + { + "epoch": 6.096376540903997, + "grad_norm": 1.2202836275100708, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 8160 + }, + { + "epoch": 6.1038475905864775, + "grad_norm": 1.0576019287109375, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 8170 + }, + { + "epoch": 6.111318640268958, + "grad_norm": 1.31708824634552, + "learning_rate": 0.0002, + "loss": 1.1395, + "step": 8180 + }, + { + "epoch": 6.118789689951438, + "grad_norm": 1.0479495525360107, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 8190 + }, + { + "epoch": 6.126260739633919, + "grad_norm": 1.285003423690796, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 8200 + }, + { + "epoch": 6.133731789316399, + "grad_norm": 1.0989165306091309, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 8210 + }, + { + "epoch": 6.141202838998879, + "grad_norm": 1.1659013032913208, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 8220 + }, + { + "epoch": 6.14867388868136, + "grad_norm": 1.2796376943588257, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 8230 + }, + { + "epoch": 6.15614493836384, + "grad_norm": 1.060564637184143, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 8240 + }, + { + "epoch": 6.16361598804632, + "grad_norm": 1.3884605169296265, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 8250 + }, + { + "epoch": 6.1710870377288005, + "grad_norm": 1.1570569276809692, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 8260 + }, + { + "epoch": 6.1785580874112815, + "grad_norm": 1.4136502742767334, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 8270 + }, + { + "epoch": 6.186029137093762, + "grad_norm": 1.3396095037460327, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 8280 + }, + { + "epoch": 6.193500186776242, + "grad_norm": 1.2549997568130493, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 8290 + }, + { + "epoch": 6.200971236458723, + "grad_norm": 1.3629751205444336, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 8300 + }, + { + "epoch": 6.208442286141203, + "grad_norm": 1.1029163599014282, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 8310 + }, + { + "epoch": 6.215913335823683, + "grad_norm": 1.1992450952529907, + "learning_rate": 0.0002, + "loss": 1.0895, + "step": 8320 + }, + { + "epoch": 6.223384385506163, + "grad_norm": 1.3317986726760864, + "learning_rate": 0.0002, + "loss": 1.1417, + "step": 8330 + }, + { + "epoch": 6.230855435188644, + "grad_norm": 1.0538336038589478, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 8340 + }, + { + "epoch": 6.238326484871124, + "grad_norm": 1.1767704486846924, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 8350 + }, + { + "epoch": 6.2457975345536045, + "grad_norm": 1.1213016510009766, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 8360 + }, + { + "epoch": 6.253268584236086, + "grad_norm": 1.1895716190338135, + "learning_rate": 0.0002, + "loss": 1.1241, + "step": 8370 + }, + { + "epoch": 6.260739633918566, + "grad_norm": 1.1078153848648071, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 8380 + }, + { + "epoch": 6.268210683601046, + "grad_norm": 1.1662801504135132, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 8390 + }, + { + "epoch": 6.275681733283526, + "grad_norm": 1.2071197032928467, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 8400 + }, + { + "epoch": 6.283152782966007, + "grad_norm": 1.2653778791427612, + "learning_rate": 0.0002, + "loss": 1.0625, + "step": 8410 + }, + { + "epoch": 6.290623832648487, + "grad_norm": 1.6128872632980347, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 8420 + }, + { + "epoch": 6.298094882330967, + "grad_norm": 1.4993070363998413, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 8430 + }, + { + "epoch": 6.305565932013448, + "grad_norm": 1.16339910030365, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 8440 + }, + { + "epoch": 6.313036981695928, + "grad_norm": 1.256822943687439, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 8450 + }, + { + "epoch": 6.3205080313784086, + "grad_norm": 1.1352964639663696, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 8460 + }, + { + "epoch": 6.327979081060889, + "grad_norm": 1.0061070919036865, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 8470 + }, + { + "epoch": 6.33545013074337, + "grad_norm": 1.1901768445968628, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 8480 + }, + { + "epoch": 6.34292118042585, + "grad_norm": 1.2715139389038086, + "learning_rate": 0.0002, + "loss": 1.1463, + "step": 8490 + }, + { + "epoch": 6.35039223010833, + "grad_norm": 1.1583346128463745, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 8500 + }, + { + "epoch": 6.357863279790811, + "grad_norm": 1.1427477598190308, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 8510 + }, + { + "epoch": 6.365334329473291, + "grad_norm": 1.1952263116836548, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 8520 + }, + { + "epoch": 6.372805379155771, + "grad_norm": 1.0599623918533325, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 8530 + }, + { + "epoch": 6.380276428838251, + "grad_norm": 1.3511574268341064, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 8540 + }, + { + "epoch": 6.387747478520732, + "grad_norm": 1.171126127243042, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 8550 + }, + { + "epoch": 6.395218528203213, + "grad_norm": 1.285474419593811, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 8560 + }, + { + "epoch": 6.402689577885693, + "grad_norm": 0.9751279950141907, + "learning_rate": 0.0002, + "loss": 1.1505, + "step": 8570 + }, + { + "epoch": 6.410160627568174, + "grad_norm": 1.2194149494171143, + "learning_rate": 0.0002, + "loss": 1.1502, + "step": 8580 + }, + { + "epoch": 6.417631677250654, + "grad_norm": 1.255888819694519, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 8590 + }, + { + "epoch": 6.425102726933134, + "grad_norm": 1.1636122465133667, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 8600 + }, + { + "epoch": 6.432573776615614, + "grad_norm": 1.0769859552383423, + "learning_rate": 0.0002, + "loss": 1.1398, + "step": 8610 + }, + { + "epoch": 6.440044826298095, + "grad_norm": 1.151778221130371, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 8620 + }, + { + "epoch": 6.447515875980575, + "grad_norm": 1.2749944925308228, + "learning_rate": 0.0002, + "loss": 1.0706, + "step": 8630 + }, + { + "epoch": 6.454986925663055, + "grad_norm": 1.1925828456878662, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 8640 + }, + { + "epoch": 6.4624579753455365, + "grad_norm": 1.166107416152954, + "learning_rate": 0.0002, + "loss": 1.1581, + "step": 8650 + }, + { + "epoch": 6.469929025028017, + "grad_norm": 1.0372248888015747, + "learning_rate": 0.0002, + "loss": 1.105, + "step": 8660 + }, + { + "epoch": 6.477400074710497, + "grad_norm": 1.26933753490448, + "learning_rate": 0.0002, + "loss": 1.1546, + "step": 8670 + }, + { + "epoch": 6.484871124392977, + "grad_norm": 1.2154223918914795, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 8680 + }, + { + "epoch": 6.492342174075458, + "grad_norm": 1.09475839138031, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 8690 + }, + { + "epoch": 6.499813223757938, + "grad_norm": 1.0763037204742432, + "learning_rate": 0.0002, + "loss": 1.1168, + "step": 8700 + }, + { + "epoch": 6.507284273440418, + "grad_norm": 1.1882896423339844, + "learning_rate": 0.0002, + "loss": 1.1993, + "step": 8710 + }, + { + "epoch": 6.514755323122898, + "grad_norm": 1.1662089824676514, + "learning_rate": 0.0002, + "loss": 1.1498, + "step": 8720 + }, + { + "epoch": 6.522226372805379, + "grad_norm": 1.3259495496749878, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 8730 + }, + { + "epoch": 6.5296974224878594, + "grad_norm": 1.0858017206192017, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 8740 + }, + { + "epoch": 6.53716847217034, + "grad_norm": 1.240337610244751, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 8750 + }, + { + "epoch": 6.544639521852821, + "grad_norm": 1.1381462812423706, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 8760 + }, + { + "epoch": 6.552110571535301, + "grad_norm": 1.2220063209533691, + "learning_rate": 0.0002, + "loss": 1.0991, + "step": 8770 + }, + { + "epoch": 6.559581621217781, + "grad_norm": 1.1553083658218384, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 8780 + }, + { + "epoch": 6.567052670900262, + "grad_norm": 1.1383219957351685, + "learning_rate": 0.0002, + "loss": 1.0996, + "step": 8790 + }, + { + "epoch": 6.574523720582742, + "grad_norm": 1.0379676818847656, + "learning_rate": 0.0002, + "loss": 1.1355, + "step": 8800 + }, + { + "epoch": 6.581994770265222, + "grad_norm": 1.376488447189331, + "learning_rate": 0.0002, + "loss": 1.1704, + "step": 8810 + }, + { + "epoch": 6.589465819947702, + "grad_norm": 1.1586211919784546, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 8820 + }, + { + "epoch": 6.596936869630183, + "grad_norm": 1.28152334690094, + "learning_rate": 0.0002, + "loss": 1.1904, + "step": 8830 + }, + { + "epoch": 6.6044079193126635, + "grad_norm": 1.2656810283660889, + "learning_rate": 0.0002, + "loss": 1.1646, + "step": 8840 + }, + { + "epoch": 6.611878968995144, + "grad_norm": 1.0636502504348755, + "learning_rate": 0.0002, + "loss": 1.1865, + "step": 8850 + }, + { + "epoch": 6.619350018677624, + "grad_norm": 1.273239254951477, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 8860 + }, + { + "epoch": 6.626821068360105, + "grad_norm": 1.1055482625961304, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 8870 + }, + { + "epoch": 6.634292118042585, + "grad_norm": 1.1934176683425903, + "learning_rate": 0.0002, + "loss": 1.0877, + "step": 8880 + }, + { + "epoch": 6.641763167725065, + "grad_norm": 1.2248114347457886, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 8890 + }, + { + "epoch": 6.649234217407546, + "grad_norm": 1.1950982809066772, + "learning_rate": 0.0002, + "loss": 1.1609, + "step": 8900 + }, + { + "epoch": 6.656705267090026, + "grad_norm": 1.0821784734725952, + "learning_rate": 0.0002, + "loss": 1.169, + "step": 8910 + }, + { + "epoch": 6.664176316772506, + "grad_norm": 1.0062463283538818, + "learning_rate": 0.0002, + "loss": 1.1337, + "step": 8920 + }, + { + "epoch": 6.671647366454987, + "grad_norm": 1.2373089790344238, + "learning_rate": 0.0002, + "loss": 1.1403, + "step": 8930 + }, + { + "epoch": 6.6791184161374675, + "grad_norm": 1.1821746826171875, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 8940 + }, + { + "epoch": 6.686589465819948, + "grad_norm": 1.2350659370422363, + "learning_rate": 0.0002, + "loss": 1.1214, + "step": 8950 + }, + { + "epoch": 6.694060515502428, + "grad_norm": 1.1012883186340332, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 8960 + }, + { + "epoch": 6.701531565184909, + "grad_norm": 1.2008943557739258, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 8970 + }, + { + "epoch": 6.709002614867389, + "grad_norm": 1.2355504035949707, + "learning_rate": 0.0002, + "loss": 1.1769, + "step": 8980 + }, + { + "epoch": 6.716473664549869, + "grad_norm": 1.2367502450942993, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 8990 + }, + { + "epoch": 6.723944714232349, + "grad_norm": 1.1075866222381592, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 9000 + }, + { + "epoch": 6.73141576391483, + "grad_norm": 1.246480941772461, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 9010 + }, + { + "epoch": 6.73888681359731, + "grad_norm": 1.1252824068069458, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 9020 + }, + { + "epoch": 6.7463578632797905, + "grad_norm": 1.0706887245178223, + "learning_rate": 0.0002, + "loss": 1.1762, + "step": 9030 + }, + { + "epoch": 6.7538289129622715, + "grad_norm": 1.0874755382537842, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 9040 + }, + { + "epoch": 6.761299962644752, + "grad_norm": 1.121434211730957, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 9050 + }, + { + "epoch": 6.768771012327232, + "grad_norm": 1.1517996788024902, + "learning_rate": 0.0002, + "loss": 1.2018, + "step": 9060 + }, + { + "epoch": 6.776242062009713, + "grad_norm": 1.2484540939331055, + "learning_rate": 0.0002, + "loss": 1.1593, + "step": 9070 + }, + { + "epoch": 6.783713111692193, + "grad_norm": 1.023059368133545, + "learning_rate": 0.0002, + "loss": 1.13, + "step": 9080 + }, + { + "epoch": 6.791184161374673, + "grad_norm": 1.1334631443023682, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 9090 + }, + { + "epoch": 6.798655211057153, + "grad_norm": 1.2991816997528076, + "learning_rate": 0.0002, + "loss": 1.18, + "step": 9100 + }, + { + "epoch": 6.806126260739634, + "grad_norm": 1.4147199392318726, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 9110 + }, + { + "epoch": 6.813597310422114, + "grad_norm": 1.1353832483291626, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 9120 + }, + { + "epoch": 6.8210683601045945, + "grad_norm": 1.0332539081573486, + "learning_rate": 0.0002, + "loss": 1.1379, + "step": 9130 + }, + { + "epoch": 6.828539409787075, + "grad_norm": 1.2208142280578613, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 9140 + }, + { + "epoch": 6.836010459469556, + "grad_norm": 1.3033398389816284, + "learning_rate": 0.0002, + "loss": 1.1463, + "step": 9150 + }, + { + "epoch": 6.843481509152036, + "grad_norm": 1.2676737308502197, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 9160 + }, + { + "epoch": 6.850952558834516, + "grad_norm": 1.1668603420257568, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 9170 + }, + { + "epoch": 6.858423608516997, + "grad_norm": 1.1994788646697998, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 9180 + }, + { + "epoch": 6.865894658199477, + "grad_norm": 1.231873869895935, + "learning_rate": 0.0002, + "loss": 1.2131, + "step": 9190 + }, + { + "epoch": 6.873365707881957, + "grad_norm": 0.9981484413146973, + "learning_rate": 0.0002, + "loss": 1.2109, + "step": 9200 + }, + { + "epoch": 6.880836757564438, + "grad_norm": 1.2799428701400757, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 9210 + }, + { + "epoch": 6.888307807246918, + "grad_norm": 1.2042057514190674, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 9220 + }, + { + "epoch": 6.8957788569293985, + "grad_norm": 1.070420265197754, + "learning_rate": 0.0002, + "loss": 1.1567, + "step": 9230 + }, + { + "epoch": 6.903249906611879, + "grad_norm": 1.327160358428955, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 9240 + }, + { + "epoch": 6.91072095629436, + "grad_norm": 1.1109007596969604, + "learning_rate": 0.0002, + "loss": 1.1945, + "step": 9250 + }, + { + "epoch": 6.91819200597684, + "grad_norm": 1.1669930219650269, + "learning_rate": 0.0002, + "loss": 1.1701, + "step": 9260 + }, + { + "epoch": 6.92566305565932, + "grad_norm": 1.034532904624939, + "learning_rate": 0.0002, + "loss": 1.1854, + "step": 9270 + }, + { + "epoch": 6.9331341053418, + "grad_norm": 1.1035540103912354, + "learning_rate": 0.0002, + "loss": 1.1712, + "step": 9280 + }, + { + "epoch": 6.940605155024281, + "grad_norm": 1.366254448890686, + "learning_rate": 0.0002, + "loss": 1.1767, + "step": 9290 + }, + { + "epoch": 6.948076204706761, + "grad_norm": 1.094214677810669, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 9300 + }, + { + "epoch": 6.955547254389241, + "grad_norm": 1.131238579750061, + "learning_rate": 0.0002, + "loss": 1.18, + "step": 9310 + }, + { + "epoch": 6.963018304071722, + "grad_norm": 1.202369213104248, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 9320 + }, + { + "epoch": 6.9704893537542025, + "grad_norm": 1.1067225933074951, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 9330 + }, + { + "epoch": 6.977960403436683, + "grad_norm": 1.0258643627166748, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 9340 + }, + { + "epoch": 6.985431453119164, + "grad_norm": 1.3311655521392822, + "learning_rate": 0.0002, + "loss": 1.2053, + "step": 9350 + }, + { + "epoch": 6.992902502801644, + "grad_norm": 1.1245559453964233, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 9360 + }, + { + "epoch": 6.999626447515876, + "eval_loss": 2.128103017807007, + "eval_runtime": 39.1339, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.661, + "step": 9369 + }, + { + "epoch": 7.000373552484124, + "grad_norm": 1.0868251323699951, + "learning_rate": 0.0002, + "loss": 1.1782, + "step": 9370 + }, + { + "epoch": 7.007844602166604, + "grad_norm": 1.5252128839492798, + "learning_rate": 0.0002, + "loss": 1.0133, + "step": 9380 + }, + { + "epoch": 7.015315651849085, + "grad_norm": 1.1230034828186035, + "learning_rate": 0.0002, + "loss": 0.9364, + "step": 9390 + }, + { + "epoch": 7.022786701531565, + "grad_norm": 1.275871992111206, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 9400 + }, + { + "epoch": 7.030257751214045, + "grad_norm": 1.462963342666626, + "learning_rate": 0.0002, + "loss": 0.9305, + "step": 9410 + }, + { + "epoch": 7.0377288008965255, + "grad_norm": 1.0506054162979126, + "learning_rate": 0.0002, + "loss": 0.9329, + "step": 9420 + }, + { + "epoch": 7.0451998505790066, + "grad_norm": 1.4315128326416016, + "learning_rate": 0.0002, + "loss": 0.9398, + "step": 9430 + }, + { + "epoch": 7.052670900261487, + "grad_norm": 1.5143473148345947, + "learning_rate": 0.0002, + "loss": 0.9086, + "step": 9440 + }, + { + "epoch": 7.060141949943967, + "grad_norm": 1.2537293434143066, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 9450 + }, + { + "epoch": 7.067612999626448, + "grad_norm": 1.36807382106781, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 9460 + }, + { + "epoch": 7.075084049308928, + "grad_norm": 1.5365028381347656, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 9470 + }, + { + "epoch": 7.082555098991408, + "grad_norm": 1.227250576019287, + "learning_rate": 0.0002, + "loss": 1.0045, + "step": 9480 + }, + { + "epoch": 7.090026148673888, + "grad_norm": 1.6941372156143188, + "learning_rate": 0.0002, + "loss": 0.9745, + "step": 9490 + }, + { + "epoch": 7.097497198356369, + "grad_norm": 1.587410569190979, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 9500 + }, + { + "epoch": 7.104968248038849, + "grad_norm": 1.481272578239441, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 9510 + }, + { + "epoch": 7.1124392977213295, + "grad_norm": 1.2331953048706055, + "learning_rate": 0.0002, + "loss": 1.0066, + "step": 9520 + }, + { + "epoch": 7.119910347403811, + "grad_norm": 1.6446775197982788, + "learning_rate": 0.0002, + "loss": 0.9422, + "step": 9530 + }, + { + "epoch": 7.127381397086291, + "grad_norm": 1.2055929899215698, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 9540 + }, + { + "epoch": 7.134852446768771, + "grad_norm": 1.119033932685852, + "learning_rate": 0.0002, + "loss": 0.8959, + "step": 9550 + }, + { + "epoch": 7.142323496451251, + "grad_norm": 1.712833046913147, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 9560 + }, + { + "epoch": 7.149794546133732, + "grad_norm": 1.2007980346679688, + "learning_rate": 0.0002, + "loss": 0.9326, + "step": 9570 + }, + { + "epoch": 7.157265595816212, + "grad_norm": 1.3251731395721436, + "learning_rate": 0.0002, + "loss": 0.9926, + "step": 9580 + }, + { + "epoch": 7.164736645498692, + "grad_norm": 1.1897934675216675, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 9590 + }, + { + "epoch": 7.172207695181173, + "grad_norm": 1.348583698272705, + "learning_rate": 0.0002, + "loss": 0.9401, + "step": 9600 + }, + { + "epoch": 7.179678744863653, + "grad_norm": 1.1588937044143677, + "learning_rate": 0.0002, + "loss": 0.9931, + "step": 9610 + }, + { + "epoch": 7.187149794546134, + "grad_norm": 1.3808276653289795, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 9620 + }, + { + "epoch": 7.194620844228614, + "grad_norm": 1.552425503730774, + "learning_rate": 0.0002, + "loss": 0.9955, + "step": 9630 + }, + { + "epoch": 7.202091893911095, + "grad_norm": 1.3649828433990479, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 9640 + }, + { + "epoch": 7.209562943593575, + "grad_norm": 1.3196533918380737, + "learning_rate": 0.0002, + "loss": 0.973, + "step": 9650 + }, + { + "epoch": 7.217033993276055, + "grad_norm": 1.4874017238616943, + "learning_rate": 0.0002, + "loss": 1.0119, + "step": 9660 + }, + { + "epoch": 7.224505042958536, + "grad_norm": 1.2448325157165527, + "learning_rate": 0.0002, + "loss": 0.9778, + "step": 9670 + }, + { + "epoch": 7.231976092641016, + "grad_norm": 1.4631818532943726, + "learning_rate": 0.0002, + "loss": 0.9697, + "step": 9680 + }, + { + "epoch": 7.239447142323496, + "grad_norm": 1.2041361331939697, + "learning_rate": 0.0002, + "loss": 0.9827, + "step": 9690 + }, + { + "epoch": 7.246918192005976, + "grad_norm": 1.559156060218811, + "learning_rate": 0.0002, + "loss": 0.9417, + "step": 9700 + }, + { + "epoch": 7.2543892416884574, + "grad_norm": 1.3939464092254639, + "learning_rate": 0.0002, + "loss": 1.0232, + "step": 9710 + }, + { + "epoch": 7.261860291370938, + "grad_norm": 1.347583293914795, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 9720 + }, + { + "epoch": 7.269331341053418, + "grad_norm": 1.302850365638733, + "learning_rate": 0.0002, + "loss": 0.9256, + "step": 9730 + }, + { + "epoch": 7.276802390735899, + "grad_norm": 1.1425062417984009, + "learning_rate": 0.0002, + "loss": 0.956, + "step": 9740 + }, + { + "epoch": 7.284273440418379, + "grad_norm": 1.2865869998931885, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 9750 + }, + { + "epoch": 7.291744490100859, + "grad_norm": 1.3773187398910522, + "learning_rate": 0.0002, + "loss": 0.9841, + "step": 9760 + }, + { + "epoch": 7.299215539783339, + "grad_norm": 1.2692701816558838, + "learning_rate": 0.0002, + "loss": 1.0063, + "step": 9770 + }, + { + "epoch": 7.30668658946582, + "grad_norm": 1.38542902469635, + "learning_rate": 0.0002, + "loss": 1.0347, + "step": 9780 + }, + { + "epoch": 7.3141576391483, + "grad_norm": 1.2204844951629639, + "learning_rate": 0.0002, + "loss": 0.9606, + "step": 9790 + }, + { + "epoch": 7.32162868883078, + "grad_norm": 1.4863795042037964, + "learning_rate": 0.0002, + "loss": 1.0225, + "step": 9800 + }, + { + "epoch": 7.3290997385132615, + "grad_norm": 1.2458586692810059, + "learning_rate": 0.0002, + "loss": 0.9933, + "step": 9810 + }, + { + "epoch": 7.336570788195742, + "grad_norm": 1.3530622720718384, + "learning_rate": 0.0002, + "loss": 1.0336, + "step": 9820 + }, + { + "epoch": 7.344041837878222, + "grad_norm": 1.2571991682052612, + "learning_rate": 0.0002, + "loss": 0.9319, + "step": 9830 + }, + { + "epoch": 7.351512887560702, + "grad_norm": 1.3074439764022827, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 9840 + }, + { + "epoch": 7.358983937243183, + "grad_norm": 1.2986950874328613, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 9850 + }, + { + "epoch": 7.366454986925663, + "grad_norm": 1.4233403205871582, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 9860 + }, + { + "epoch": 7.373926036608143, + "grad_norm": 1.468161702156067, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 9870 + }, + { + "epoch": 7.381397086290624, + "grad_norm": 1.354690432548523, + "learning_rate": 0.0002, + "loss": 1.1074, + "step": 9880 + }, + { + "epoch": 7.388868135973104, + "grad_norm": 1.4891324043273926, + "learning_rate": 0.0002, + "loss": 1.0153, + "step": 9890 + }, + { + "epoch": 7.3963391856555845, + "grad_norm": 1.3470090627670288, + "learning_rate": 0.0002, + "loss": 1.0234, + "step": 9900 + }, + { + "epoch": 7.403810235338065, + "grad_norm": 1.373061180114746, + "learning_rate": 0.0002, + "loss": 1.063, + "step": 9910 + }, + { + "epoch": 7.411281285020546, + "grad_norm": 1.4181641340255737, + "learning_rate": 0.0002, + "loss": 1.0109, + "step": 9920 + }, + { + "epoch": 7.418752334703026, + "grad_norm": 1.3284671306610107, + "learning_rate": 0.0002, + "loss": 0.9801, + "step": 9930 + }, + { + "epoch": 7.426223384385506, + "grad_norm": 1.333896517753601, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 9940 + }, + { + "epoch": 7.433694434067987, + "grad_norm": 1.6348158121109009, + "learning_rate": 0.0002, + "loss": 1.0056, + "step": 9950 + }, + { + "epoch": 7.441165483750467, + "grad_norm": 1.364643931388855, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 9960 + }, + { + "epoch": 7.448636533432947, + "grad_norm": 1.3974874019622803, + "learning_rate": 0.0002, + "loss": 1.0076, + "step": 9970 + }, + { + "epoch": 7.456107583115427, + "grad_norm": 1.5207233428955078, + "learning_rate": 0.0002, + "loss": 0.9918, + "step": 9980 + }, + { + "epoch": 7.463578632797908, + "grad_norm": 1.541517734527588, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 9990 + }, + { + "epoch": 7.4710496824803885, + "grad_norm": 1.3563939332962036, + "learning_rate": 0.0002, + "loss": 0.9904, + "step": 10000 + }, + { + "epoch": 7.478520732162869, + "grad_norm": 1.3443987369537354, + "learning_rate": 0.0002, + "loss": 1.0285, + "step": 10010 + }, + { + "epoch": 7.48599178184535, + "grad_norm": 1.2904508113861084, + "learning_rate": 0.0002, + "loss": 1.0028, + "step": 10020 + }, + { + "epoch": 7.49346283152783, + "grad_norm": 1.434145450592041, + "learning_rate": 0.0002, + "loss": 0.9949, + "step": 10030 + }, + { + "epoch": 7.50093388121031, + "grad_norm": 1.4659384489059448, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 10040 + }, + { + "epoch": 7.508404930892791, + "grad_norm": 1.3430006504058838, + "learning_rate": 0.0002, + "loss": 1.0063, + "step": 10050 + }, + { + "epoch": 7.515875980575271, + "grad_norm": 1.3595343828201294, + "learning_rate": 0.0002, + "loss": 1.0168, + "step": 10060 + }, + { + "epoch": 7.523347030257751, + "grad_norm": 1.7456434965133667, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 10070 + }, + { + "epoch": 7.530818079940231, + "grad_norm": 1.329853892326355, + "learning_rate": 0.0002, + "loss": 1.0843, + "step": 10080 + }, + { + "epoch": 7.538289129622712, + "grad_norm": 1.548466682434082, + "learning_rate": 0.0002, + "loss": 1.0879, + "step": 10090 + }, + { + "epoch": 7.5457601793051925, + "grad_norm": 1.2951644659042358, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 10100 + }, + { + "epoch": 7.553231228987673, + "grad_norm": 1.3988010883331299, + "learning_rate": 0.0002, + "loss": 1.007, + "step": 10110 + }, + { + "epoch": 7.560702278670153, + "grad_norm": 1.211068868637085, + "learning_rate": 0.0002, + "loss": 1.0353, + "step": 10120 + }, + { + "epoch": 7.568173328352634, + "grad_norm": 1.2159098386764526, + "learning_rate": 0.0002, + "loss": 0.9962, + "step": 10130 + }, + { + "epoch": 7.575644378035114, + "grad_norm": 1.3533744812011719, + "learning_rate": 0.0002, + "loss": 0.9928, + "step": 10140 + }, + { + "epoch": 7.583115427717594, + "grad_norm": 1.3153362274169922, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 10150 + }, + { + "epoch": 7.590586477400075, + "grad_norm": 1.535762906074524, + "learning_rate": 0.0002, + "loss": 1.0017, + "step": 10160 + }, + { + "epoch": 7.598057527082555, + "grad_norm": 1.5531504154205322, + "learning_rate": 0.0002, + "loss": 1.0592, + "step": 10170 + }, + { + "epoch": 7.605528576765035, + "grad_norm": 1.3588606119155884, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 10180 + }, + { + "epoch": 7.6129996264475155, + "grad_norm": 1.6648331880569458, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 10190 + }, + { + "epoch": 7.6204706761299965, + "grad_norm": 1.250205159187317, + "learning_rate": 0.0002, + "loss": 1.04, + "step": 10200 + }, + { + "epoch": 7.627941725812477, + "grad_norm": 1.2442443370819092, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 10210 + }, + { + "epoch": 7.635412775494957, + "grad_norm": 1.386197805404663, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 10220 + }, + { + "epoch": 7.642883825177438, + "grad_norm": 1.3478381633758545, + "learning_rate": 0.0002, + "loss": 1.0233, + "step": 10230 + }, + { + "epoch": 7.650354874859918, + "grad_norm": 1.2800627946853638, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 10240 + }, + { + "epoch": 7.657825924542398, + "grad_norm": 1.4082499742507935, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 10250 + }, + { + "epoch": 7.665296974224878, + "grad_norm": 1.321529746055603, + "learning_rate": 0.0002, + "loss": 1.0462, + "step": 10260 + }, + { + "epoch": 7.672768023907359, + "grad_norm": 1.4213372468948364, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 10270 + }, + { + "epoch": 7.680239073589839, + "grad_norm": 1.5585565567016602, + "learning_rate": 0.0002, + "loss": 0.9938, + "step": 10280 + }, + { + "epoch": 7.6877101232723195, + "grad_norm": 1.4025108814239502, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 10290 + }, + { + "epoch": 7.6951811729548005, + "grad_norm": 1.344456434249878, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 10300 + }, + { + "epoch": 7.702652222637281, + "grad_norm": 1.4962990283966064, + "learning_rate": 0.0002, + "loss": 1.0157, + "step": 10310 + }, + { + "epoch": 7.710123272319761, + "grad_norm": 1.4523862600326538, + "learning_rate": 0.0002, + "loss": 1.0621, + "step": 10320 + }, + { + "epoch": 7.717594322002241, + "grad_norm": 1.401842474937439, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 10330 + }, + { + "epoch": 7.725065371684722, + "grad_norm": 1.517730474472046, + "learning_rate": 0.0002, + "loss": 1.0028, + "step": 10340 + }, + { + "epoch": 7.732536421367202, + "grad_norm": 1.3876111507415771, + "learning_rate": 0.0002, + "loss": 1.0061, + "step": 10350 + }, + { + "epoch": 7.740007471049682, + "grad_norm": 1.5741353034973145, + "learning_rate": 0.0002, + "loss": 1.0071, + "step": 10360 + }, + { + "epoch": 7.747478520732163, + "grad_norm": 1.3465591669082642, + "learning_rate": 0.0002, + "loss": 1.0472, + "step": 10370 + }, + { + "epoch": 7.754949570414643, + "grad_norm": 1.3611412048339844, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 10380 + }, + { + "epoch": 7.7624206200971235, + "grad_norm": 1.693565011024475, + "learning_rate": 0.0002, + "loss": 1.0118, + "step": 10390 + }, + { + "epoch": 7.769891669779604, + "grad_norm": 1.4654128551483154, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 10400 + }, + { + "epoch": 7.777362719462085, + "grad_norm": 1.417768955230713, + "learning_rate": 0.0002, + "loss": 1.029, + "step": 10410 + }, + { + "epoch": 7.784833769144565, + "grad_norm": 1.3143322467803955, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 10420 + }, + { + "epoch": 7.792304818827045, + "grad_norm": 1.3467497825622559, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 10430 + }, + { + "epoch": 7.799775868509526, + "grad_norm": 1.223697543144226, + "learning_rate": 0.0002, + "loss": 1.0555, + "step": 10440 + }, + { + "epoch": 7.807246918192006, + "grad_norm": 1.3060917854309082, + "learning_rate": 0.0002, + "loss": 1.0198, + "step": 10450 + }, + { + "epoch": 7.814717967874486, + "grad_norm": 1.5561134815216064, + "learning_rate": 0.0002, + "loss": 1.0896, + "step": 10460 + }, + { + "epoch": 7.822189017556966, + "grad_norm": 1.2789647579193115, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 10470 + }, + { + "epoch": 7.829660067239447, + "grad_norm": 1.2422796487808228, + "learning_rate": 0.0002, + "loss": 1.0549, + "step": 10480 + }, + { + "epoch": 7.8371311169219275, + "grad_norm": 1.377565622329712, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 10490 + }, + { + "epoch": 7.844602166604408, + "grad_norm": 1.2221037149429321, + "learning_rate": 0.0002, + "loss": 1.0864, + "step": 10500 + }, + { + "epoch": 7.852073216286888, + "grad_norm": 1.3779186010360718, + "learning_rate": 0.0002, + "loss": 1.0944, + "step": 10510 + }, + { + "epoch": 7.859544265969369, + "grad_norm": 1.3062539100646973, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 10520 + }, + { + "epoch": 7.867015315651849, + "grad_norm": 1.4066052436828613, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 10530 + }, + { + "epoch": 7.874486365334329, + "grad_norm": 1.6326613426208496, + "learning_rate": 0.0002, + "loss": 1.0674, + "step": 10540 + }, + { + "epoch": 7.88195741501681, + "grad_norm": 1.1732137203216553, + "learning_rate": 0.0002, + "loss": 1.1237, + "step": 10550 + }, + { + "epoch": 7.88942846469929, + "grad_norm": 1.303125023841858, + "learning_rate": 0.0002, + "loss": 1.0682, + "step": 10560 + }, + { + "epoch": 7.89689951438177, + "grad_norm": 1.294990062713623, + "learning_rate": 0.0002, + "loss": 1.0524, + "step": 10570 + }, + { + "epoch": 7.904370564064251, + "grad_norm": 1.4719983339309692, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 10580 + }, + { + "epoch": 7.911841613746732, + "grad_norm": 1.4117742776870728, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 10590 + }, + { + "epoch": 7.919312663429212, + "grad_norm": 1.384812355041504, + "learning_rate": 0.0002, + "loss": 1.1129, + "step": 10600 + }, + { + "epoch": 7.926783713111692, + "grad_norm": 1.5743740797042847, + "learning_rate": 0.0002, + "loss": 0.9994, + "step": 10610 + }, + { + "epoch": 7.934254762794173, + "grad_norm": 1.2799863815307617, + "learning_rate": 0.0002, + "loss": 1.0168, + "step": 10620 + }, + { + "epoch": 7.941725812476653, + "grad_norm": 1.4822591543197632, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 10630 + }, + { + "epoch": 7.949196862159133, + "grad_norm": 1.4634777307510376, + "learning_rate": 0.0002, + "loss": 1.0362, + "step": 10640 + }, + { + "epoch": 7.956667911841613, + "grad_norm": 1.5230964422225952, + "learning_rate": 0.0002, + "loss": 1.0116, + "step": 10650 + }, + { + "epoch": 7.964138961524094, + "grad_norm": 1.3622701168060303, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 10660 + }, + { + "epoch": 7.971610011206574, + "grad_norm": 1.2133928537368774, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 10670 + }, + { + "epoch": 7.9790810608890546, + "grad_norm": 1.2852206230163574, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 10680 + }, + { + "epoch": 7.986552110571536, + "grad_norm": 1.243310570716858, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 10690 + }, + { + "epoch": 7.994023160254016, + "grad_norm": 1.459757924079895, + "learning_rate": 0.0002, + "loss": 1.053, + "step": 10700 + }, + { + "epoch": 7.997011580127008, + "eval_loss": 2.2722349166870117, + "eval_runtime": 38.9945, + "eval_samples_per_second": 13.207, + "eval_steps_per_second": 1.667, + "step": 10704 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.9535715277327565e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-10704/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6faa882079d4ad3a446600d712cd5c40781e2203 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea4fc35757651c7c3a6bde3cbe3a29f159b4d024130edde539538989c54f675 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..95d92dee815ea20b248982d95f547f4cda7984a8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4d3550f9d0c229652556a63f6a1db40a586e6c5c4c6a62b3dda5721a65c6553 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d9a9abb6f674fd4d3a5911d973b34ab7cb5369a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4da1ccefac00163faaeff70f727c5f6340a85466e9e936f079a69b938730bb96 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ec9a23aaf79117a31b48df9967f21e29cdce898 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:392d8ed6255678b167709c169ac1b7830ba060af79331ac91cf29c25699c5e20 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..95d5b113fb7382447effb87ae81d204039519304 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/trainer_state.json @@ -0,0 +1,972 @@ +{ + "best_metric": 1.8051470518112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338", + "epoch": 0.9996264475158759, + "eval_steps": 10, + "global_step": 1338, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.194278297711411e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f512b3457d4ef0662b10279451f78e55458a70a9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8767e0d0604ccff5925fd06b1ebf5a468436644640dabefecdff8b512facf8aa +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b07ea83cdced8ab88316d0a2aaf9c0350dab2963 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7853c5f0c85c20e1969da93901ceeec41a9e7e3c7cdea7b0086247ba5bce952 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..159de3c223247df116f6bba8651f19728f842ff1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd7a777006d9d84e8e167a92deaa029dbda5c325805a5aee24ceddb0c3465d4a +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..15d7adb63848557720d420168a0662fb562a353c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5948bcf2b8a11466457f31ca0ea8be19b105a5e0386ed276110793dbcdcaaf44 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b655c58fe7925c016a2d13d8dce2390ae7c6ef34 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/trainer_state.json @@ -0,0 +1,1918 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2677, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2388556595422822e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4926b7ab82fda49f8d82c31524a0f039d62e0c6e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d478f3d289857e8c3bac6e8f78ca8e9abcdca9a67a0f9422f335d9f204ba81c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcac5e5b1957a6ed6fa07c0ed71cfeff2f15a579 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d177228f65938693aca16f68ff150166213b5361c99e1eb61010f54f8a11485 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6937dae766b596444efa9130eff0caa9fad15f74 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815e355ea07f47424238b4562e23ff869072de740aa219e0094905863e498161 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..434a5ea562ea258150dbf9fa7244707ac6216167 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e3229c0eb3893b4ed9dc78c51b78922835046ad4263156ec195b41cd09021c9 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..08a91535c7d262f7eac4f74081b715ea9a9c9aa1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/trainer_state.json @@ -0,0 +1,2864 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 2.999626447515876, + "eval_steps": 10, + "global_step": 4015, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + }, + { + "epoch": 2.002241314904744, + "grad_norm": 0.36817631125450134, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 2680 + }, + { + "epoch": 2.0097123645872244, + "grad_norm": 0.4056456685066223, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2690 + }, + { + "epoch": 2.017183414269705, + "grad_norm": 0.37416863441467285, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 2700 + }, + { + "epoch": 2.024654463952185, + "grad_norm": 0.4273638427257538, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2710 + }, + { + "epoch": 2.0321255136346656, + "grad_norm": 0.36497923731803894, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2720 + }, + { + "epoch": 2.0395965633171462, + "grad_norm": 0.5021994113922119, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 2730 + }, + { + "epoch": 2.0470676129996264, + "grad_norm": 0.45896220207214355, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 2740 + }, + { + "epoch": 2.054538662682107, + "grad_norm": 0.3973815143108368, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2750 + }, + { + "epoch": 2.062009712364587, + "grad_norm": 0.4521815776824951, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2760 + }, + { + "epoch": 2.0694807620470677, + "grad_norm": 0.42775002121925354, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2770 + }, + { + "epoch": 2.076951811729548, + "grad_norm": 0.48158586025238037, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 2780 + }, + { + "epoch": 2.0844228614120284, + "grad_norm": 0.4612371623516083, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2790 + }, + { + "epoch": 2.091893911094509, + "grad_norm": 0.42536866664886475, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 2800 + }, + { + "epoch": 2.099364960776989, + "grad_norm": 0.48515772819519043, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 2810 + }, + { + "epoch": 2.1068360104594697, + "grad_norm": 0.41418662667274475, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2820 + }, + { + "epoch": 2.11430706014195, + "grad_norm": 0.4683697819709778, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2830 + }, + { + "epoch": 2.1217781098244304, + "grad_norm": 0.4484657049179077, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2840 + }, + { + "epoch": 2.1292491595069105, + "grad_norm": 0.6621400713920593, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 2850 + }, + { + "epoch": 2.136720209189391, + "grad_norm": 0.45074811577796936, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2860 + }, + { + "epoch": 2.1441912588718717, + "grad_norm": 0.3513113558292389, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2870 + }, + { + "epoch": 2.151662308554352, + "grad_norm": 0.40411314368247986, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2880 + }, + { + "epoch": 2.1591333582368324, + "grad_norm": 0.4121065139770508, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 2890 + }, + { + "epoch": 2.1666044079193125, + "grad_norm": 0.44723689556121826, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 2900 + }, + { + "epoch": 2.174075457601793, + "grad_norm": 0.4226122498512268, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2910 + }, + { + "epoch": 2.1815465072842732, + "grad_norm": 0.46617650985717773, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2920 + }, + { + "epoch": 2.189017556966754, + "grad_norm": 0.4506422281265259, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 2930 + }, + { + "epoch": 2.1964886066492344, + "grad_norm": 0.4892672896385193, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2940 + }, + { + "epoch": 2.2039596563317145, + "grad_norm": 0.44095516204833984, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2950 + }, + { + "epoch": 2.211430706014195, + "grad_norm": 0.41522109508514404, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2960 + }, + { + "epoch": 2.2189017556966752, + "grad_norm": 0.4860858917236328, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2970 + }, + { + "epoch": 2.226372805379156, + "grad_norm": 0.42662516236305237, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2980 + }, + { + "epoch": 2.233843855061636, + "grad_norm": 0.4390648305416107, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2990 + }, + { + "epoch": 2.2413149047441165, + "grad_norm": 0.47515565156936646, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3000 + }, + { + "epoch": 2.248785954426597, + "grad_norm": 0.4104543924331665, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 3010 + }, + { + "epoch": 2.2562570041090773, + "grad_norm": 0.4404028654098511, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 3020 + }, + { + "epoch": 2.263728053791558, + "grad_norm": 0.4717366695404053, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3030 + }, + { + "epoch": 2.271199103474038, + "grad_norm": 0.48345857858657837, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 3040 + }, + { + "epoch": 2.2786701531565186, + "grad_norm": 0.5312452912330627, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 3050 + }, + { + "epoch": 2.2861412028389987, + "grad_norm": 0.5073099732398987, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3060 + }, + { + "epoch": 2.2936122525214793, + "grad_norm": 0.5027463436126709, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 3070 + }, + { + "epoch": 2.30108330220396, + "grad_norm": 0.5436304807662964, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3080 + }, + { + "epoch": 2.30855435188644, + "grad_norm": 0.4701065123081207, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3090 + }, + { + "epoch": 2.3160254015689206, + "grad_norm": 0.46988746523857117, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 3100 + }, + { + "epoch": 2.3234964512514007, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 3110 + }, + { + "epoch": 2.3309675009338813, + "grad_norm": 0.5173566937446594, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 3120 + }, + { + "epoch": 2.3384385506163614, + "grad_norm": 0.40345850586891174, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 3130 + }, + { + "epoch": 2.345909600298842, + "grad_norm": 0.4218924939632416, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3140 + }, + { + "epoch": 2.3533806499813226, + "grad_norm": 0.41857317090034485, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 3150 + }, + { + "epoch": 2.3608516996638027, + "grad_norm": 0.4197218418121338, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3160 + }, + { + "epoch": 2.3683227493462833, + "grad_norm": 0.4260677397251129, + "learning_rate": 0.0002, + "loss": 1.6572, + "step": 3170 + }, + { + "epoch": 2.3757937990287634, + "grad_norm": 0.4209042191505432, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3180 + }, + { + "epoch": 2.383264848711244, + "grad_norm": 0.4092234969139099, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3190 + }, + { + "epoch": 2.390735898393724, + "grad_norm": 0.4928431510925293, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 3200 + }, + { + "epoch": 2.3982069480762047, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3210 + }, + { + "epoch": 2.4056779977586853, + "grad_norm": 0.4368397295475006, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3220 + }, + { + "epoch": 2.4131490474411654, + "grad_norm": 0.46122390031814575, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 3230 + }, + { + "epoch": 2.420620097123646, + "grad_norm": 0.4272301197052002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.428091146806126, + "grad_norm": 0.41480937600135803, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3250 + }, + { + "epoch": 2.4355621964886067, + "grad_norm": 0.48911941051483154, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 3260 + }, + { + "epoch": 2.443033246171087, + "grad_norm": 0.4444098472595215, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 3270 + }, + { + "epoch": 2.4505042958535674, + "grad_norm": 0.5111684799194336, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 3280 + }, + { + "epoch": 2.457975345536048, + "grad_norm": 0.5058825016021729, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 3290 + }, + { + "epoch": 2.465446395218528, + "grad_norm": 0.44173210859298706, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3300 + }, + { + "epoch": 2.4729174449010087, + "grad_norm": 0.4659745991230011, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 3310 + }, + { + "epoch": 2.480388494583489, + "grad_norm": 0.47237497568130493, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3320 + }, + { + "epoch": 2.4878595442659694, + "grad_norm": 0.47303131222724915, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 3330 + }, + { + "epoch": 2.4953305939484496, + "grad_norm": 0.4522389769554138, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 3340 + }, + { + "epoch": 2.50280164363093, + "grad_norm": 0.4467332363128662, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3350 + }, + { + "epoch": 2.5102726933134107, + "grad_norm": 0.4413762092590332, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3360 + }, + { + "epoch": 2.517743742995891, + "grad_norm": 0.495514452457428, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 3370 + }, + { + "epoch": 2.5252147926783715, + "grad_norm": 0.4429773986339569, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3380 + }, + { + "epoch": 2.5326858423608516, + "grad_norm": 0.4589079022407532, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3390 + }, + { + "epoch": 2.540156892043332, + "grad_norm": 0.4683997333049774, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 3400 + }, + { + "epoch": 2.5476279417258123, + "grad_norm": 0.4651731252670288, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 3410 + }, + { + "epoch": 2.555098991408293, + "grad_norm": 0.45818084478378296, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3420 + }, + { + "epoch": 2.5625700410907735, + "grad_norm": 0.45209529995918274, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.5700410907732536, + "grad_norm": 0.4344733655452728, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3440 + }, + { + "epoch": 2.577512140455734, + "grad_norm": 0.47435566782951355, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 3450 + }, + { + "epoch": 2.5849831901382143, + "grad_norm": 0.43841999769210815, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3460 + }, + { + "epoch": 2.592454239820695, + "grad_norm": 0.4323869049549103, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 3470 + }, + { + "epoch": 2.599925289503175, + "grad_norm": 0.44355881214141846, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 3480 + }, + { + "epoch": 2.6073963391856556, + "grad_norm": 0.45847779512405396, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 3490 + }, + { + "epoch": 2.614867388868136, + "grad_norm": 0.4411061704158783, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3500 + }, + { + "epoch": 2.6223384385506163, + "grad_norm": 0.4446796178817749, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3510 + }, + { + "epoch": 2.629809488233097, + "grad_norm": 0.41969653964042664, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3520 + }, + { + "epoch": 2.637280537915577, + "grad_norm": 0.5263747572898865, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 3530 + }, + { + "epoch": 2.6447515875980576, + "grad_norm": 0.47719451785087585, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3540 + }, + { + "epoch": 2.6522226372805378, + "grad_norm": 0.46574118733406067, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 3550 + }, + { + "epoch": 2.6596936869630183, + "grad_norm": 0.46867135167121887, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 3560 + }, + { + "epoch": 2.667164736645499, + "grad_norm": 0.4441198706626892, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3570 + }, + { + "epoch": 2.674635786327979, + "grad_norm": 0.4871319830417633, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3580 + }, + { + "epoch": 2.6821068360104596, + "grad_norm": 0.43900373578071594, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 3590 + }, + { + "epoch": 2.6895778856929398, + "grad_norm": 0.42509549856185913, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 3600 + }, + { + "epoch": 2.6970489353754203, + "grad_norm": 0.4691086709499359, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3610 + }, + { + "epoch": 2.7045199850579005, + "grad_norm": 0.46318942308425903, + "learning_rate": 0.0002, + "loss": 1.5491, + "step": 3620 + }, + { + "epoch": 2.711991034740381, + "grad_norm": 0.44631096720695496, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3630 + }, + { + "epoch": 2.7194620844228616, + "grad_norm": 0.42315489053726196, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3640 + }, + { + "epoch": 2.7269331341053418, + "grad_norm": 0.4971241056919098, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3650 + }, + { + "epoch": 2.7344041837878224, + "grad_norm": 0.4578486382961273, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 3660 + }, + { + "epoch": 2.7418752334703025, + "grad_norm": 0.46584776043891907, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3670 + }, + { + "epoch": 2.749346283152783, + "grad_norm": 0.4951731264591217, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 3680 + }, + { + "epoch": 2.756817332835263, + "grad_norm": 0.4935225546360016, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3690 + }, + { + "epoch": 2.764288382517744, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3700 + }, + { + "epoch": 2.7717594322002244, + "grad_norm": 0.4417555630207062, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 3710 + }, + { + "epoch": 2.7792304818827045, + "grad_norm": 0.48229655623435974, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 3720 + }, + { + "epoch": 2.786701531565185, + "grad_norm": 0.48562315106391907, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3730 + }, + { + "epoch": 2.794172581247665, + "grad_norm": 0.4473940432071686, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3740 + }, + { + "epoch": 2.801643630930146, + "grad_norm": 0.4626813232898712, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3750 + }, + { + "epoch": 2.809114680612626, + "grad_norm": 0.4339792728424072, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3760 + }, + { + "epoch": 2.8165857302951065, + "grad_norm": 0.5250858068466187, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 3770 + }, + { + "epoch": 2.824056779977587, + "grad_norm": 0.4537523090839386, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3780 + }, + { + "epoch": 2.831527829660067, + "grad_norm": 0.5646113157272339, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3790 + }, + { + "epoch": 2.8389988793425474, + "grad_norm": 0.44243332743644714, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3800 + }, + { + "epoch": 2.846469929025028, + "grad_norm": 0.4585791826248169, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3810 + }, + { + "epoch": 2.8539409787075085, + "grad_norm": 0.489702045917511, + "learning_rate": 0.0002, + "loss": 1.6854, + "step": 3820 + }, + { + "epoch": 2.8614120283899886, + "grad_norm": 0.502470850944519, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 3830 + }, + { + "epoch": 2.8688830780724692, + "grad_norm": 0.4395960867404938, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3840 + }, + { + "epoch": 2.87635412775495, + "grad_norm": 0.4348670244216919, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3850 + }, + { + "epoch": 2.88382517743743, + "grad_norm": 0.48852720856666565, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3860 + }, + { + "epoch": 2.89129622711991, + "grad_norm": 0.45317450165748596, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3870 + }, + { + "epoch": 2.8987672768023907, + "grad_norm": 0.4732758700847626, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3880 + }, + { + "epoch": 2.9062383264848712, + "grad_norm": 0.45238012075424194, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3890 + }, + { + "epoch": 2.9137093761673514, + "grad_norm": 0.48838064074516296, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3900 + }, + { + "epoch": 2.921180425849832, + "grad_norm": 0.43496349453926086, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 3910 + }, + { + "epoch": 2.9286514755323125, + "grad_norm": 0.47963935136795044, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 3920 + }, + { + "epoch": 2.9361225252147927, + "grad_norm": 0.4544987976551056, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 3930 + }, + { + "epoch": 2.943593574897273, + "grad_norm": 0.4622892141342163, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3940 + }, + { + "epoch": 2.9510646245797534, + "grad_norm": 0.47026222944259644, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 3950 + }, + { + "epoch": 2.958535674262234, + "grad_norm": 0.4549552798271179, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 3960 + }, + { + "epoch": 2.966006723944714, + "grad_norm": 0.46647515892982483, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3970 + }, + { + "epoch": 2.9734777736271947, + "grad_norm": 0.45095112919807434, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3980 + }, + { + "epoch": 2.9809488233096753, + "grad_norm": 0.4690017104148865, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 3990 + }, + { + "epoch": 2.9884198729921554, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 4000 + }, + { + "epoch": 2.9958909226746355, + "grad_norm": 0.4743294417858124, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4010 + }, + { + "epoch": 2.999626447515876, + "eval_loss": 1.8252571821212769, + "eval_runtime": 38.7853, + "eval_samples_per_second": 13.278, + "eval_steps_per_second": 1.676, + "step": 4015 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8582834893134234e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-4015/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6dd4e080a63c8ce12c05a3186c0385567b67691 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:023bb3375e82678547f483aeed88017e5e4cb836f95329c3807e2116a94569ef +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e8c7589fe0246488eec927afd0ad93330197dc0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6f5d437093ba4fd926c6f2b9af844b0db46d5cffed16d995a7746d6caca17f3 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1f3818fcabdac36ea21169cde6edf57becaba19 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81343a5862e20c6075e31e0516d064e185be4f7553555b02fb5c78ef8c752e8b +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..46ed1c9e06360fb5de0d44678d86c87349076cfb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943c28f99829ff3d008d59ce446faf88e17ac41806214c04b0d404cc44482013 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0fa16c3ae3c5f91f2caed42992c6dc6b9cb0f148 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/trainer_state.json @@ -0,0 +1,3810 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5354, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + }, + { + "epoch": 2.002241314904744, + "grad_norm": 0.36817631125450134, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 2680 + }, + { + "epoch": 2.0097123645872244, + "grad_norm": 0.4056456685066223, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2690 + }, + { + "epoch": 2.017183414269705, + "grad_norm": 0.37416863441467285, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 2700 + }, + { + "epoch": 2.024654463952185, + "grad_norm": 0.4273638427257538, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2710 + }, + { + "epoch": 2.0321255136346656, + "grad_norm": 0.36497923731803894, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2720 + }, + { + "epoch": 2.0395965633171462, + "grad_norm": 0.5021994113922119, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 2730 + }, + { + "epoch": 2.0470676129996264, + "grad_norm": 0.45896220207214355, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 2740 + }, + { + "epoch": 2.054538662682107, + "grad_norm": 0.3973815143108368, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2750 + }, + { + "epoch": 2.062009712364587, + "grad_norm": 0.4521815776824951, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2760 + }, + { + "epoch": 2.0694807620470677, + "grad_norm": 0.42775002121925354, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2770 + }, + { + "epoch": 2.076951811729548, + "grad_norm": 0.48158586025238037, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 2780 + }, + { + "epoch": 2.0844228614120284, + "grad_norm": 0.4612371623516083, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2790 + }, + { + "epoch": 2.091893911094509, + "grad_norm": 0.42536866664886475, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 2800 + }, + { + "epoch": 2.099364960776989, + "grad_norm": 0.48515772819519043, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 2810 + }, + { + "epoch": 2.1068360104594697, + "grad_norm": 0.41418662667274475, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2820 + }, + { + "epoch": 2.11430706014195, + "grad_norm": 0.4683697819709778, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2830 + }, + { + "epoch": 2.1217781098244304, + "grad_norm": 0.4484657049179077, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2840 + }, + { + "epoch": 2.1292491595069105, + "grad_norm": 0.6621400713920593, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 2850 + }, + { + "epoch": 2.136720209189391, + "grad_norm": 0.45074811577796936, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2860 + }, + { + "epoch": 2.1441912588718717, + "grad_norm": 0.3513113558292389, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2870 + }, + { + "epoch": 2.151662308554352, + "grad_norm": 0.40411314368247986, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2880 + }, + { + "epoch": 2.1591333582368324, + "grad_norm": 0.4121065139770508, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 2890 + }, + { + "epoch": 2.1666044079193125, + "grad_norm": 0.44723689556121826, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 2900 + }, + { + "epoch": 2.174075457601793, + "grad_norm": 0.4226122498512268, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2910 + }, + { + "epoch": 2.1815465072842732, + "grad_norm": 0.46617650985717773, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2920 + }, + { + "epoch": 2.189017556966754, + "grad_norm": 0.4506422281265259, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 2930 + }, + { + "epoch": 2.1964886066492344, + "grad_norm": 0.4892672896385193, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2940 + }, + { + "epoch": 2.2039596563317145, + "grad_norm": 0.44095516204833984, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2950 + }, + { + "epoch": 2.211430706014195, + "grad_norm": 0.41522109508514404, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2960 + }, + { + "epoch": 2.2189017556966752, + "grad_norm": 0.4860858917236328, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2970 + }, + { + "epoch": 2.226372805379156, + "grad_norm": 0.42662516236305237, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2980 + }, + { + "epoch": 2.233843855061636, + "grad_norm": 0.4390648305416107, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2990 + }, + { + "epoch": 2.2413149047441165, + "grad_norm": 0.47515565156936646, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3000 + }, + { + "epoch": 2.248785954426597, + "grad_norm": 0.4104543924331665, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 3010 + }, + { + "epoch": 2.2562570041090773, + "grad_norm": 0.4404028654098511, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 3020 + }, + { + "epoch": 2.263728053791558, + "grad_norm": 0.4717366695404053, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3030 + }, + { + "epoch": 2.271199103474038, + "grad_norm": 0.48345857858657837, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 3040 + }, + { + "epoch": 2.2786701531565186, + "grad_norm": 0.5312452912330627, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 3050 + }, + { + "epoch": 2.2861412028389987, + "grad_norm": 0.5073099732398987, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3060 + }, + { + "epoch": 2.2936122525214793, + "grad_norm": 0.5027463436126709, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 3070 + }, + { + "epoch": 2.30108330220396, + "grad_norm": 0.5436304807662964, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3080 + }, + { + "epoch": 2.30855435188644, + "grad_norm": 0.4701065123081207, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3090 + }, + { + "epoch": 2.3160254015689206, + "grad_norm": 0.46988746523857117, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 3100 + }, + { + "epoch": 2.3234964512514007, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 3110 + }, + { + "epoch": 2.3309675009338813, + "grad_norm": 0.5173566937446594, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 3120 + }, + { + "epoch": 2.3384385506163614, + "grad_norm": 0.40345850586891174, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 3130 + }, + { + "epoch": 2.345909600298842, + "grad_norm": 0.4218924939632416, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3140 + }, + { + "epoch": 2.3533806499813226, + "grad_norm": 0.41857317090034485, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 3150 + }, + { + "epoch": 2.3608516996638027, + "grad_norm": 0.4197218418121338, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3160 + }, + { + "epoch": 2.3683227493462833, + "grad_norm": 0.4260677397251129, + "learning_rate": 0.0002, + "loss": 1.6572, + "step": 3170 + }, + { + "epoch": 2.3757937990287634, + "grad_norm": 0.4209042191505432, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3180 + }, + { + "epoch": 2.383264848711244, + "grad_norm": 0.4092234969139099, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3190 + }, + { + "epoch": 2.390735898393724, + "grad_norm": 0.4928431510925293, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 3200 + }, + { + "epoch": 2.3982069480762047, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3210 + }, + { + "epoch": 2.4056779977586853, + "grad_norm": 0.4368397295475006, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3220 + }, + { + "epoch": 2.4131490474411654, + "grad_norm": 0.46122390031814575, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 3230 + }, + { + "epoch": 2.420620097123646, + "grad_norm": 0.4272301197052002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.428091146806126, + "grad_norm": 0.41480937600135803, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3250 + }, + { + "epoch": 2.4355621964886067, + "grad_norm": 0.48911941051483154, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 3260 + }, + { + "epoch": 2.443033246171087, + "grad_norm": 0.4444098472595215, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 3270 + }, + { + "epoch": 2.4505042958535674, + "grad_norm": 0.5111684799194336, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 3280 + }, + { + "epoch": 2.457975345536048, + "grad_norm": 0.5058825016021729, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 3290 + }, + { + "epoch": 2.465446395218528, + "grad_norm": 0.44173210859298706, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3300 + }, + { + "epoch": 2.4729174449010087, + "grad_norm": 0.4659745991230011, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 3310 + }, + { + "epoch": 2.480388494583489, + "grad_norm": 0.47237497568130493, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3320 + }, + { + "epoch": 2.4878595442659694, + "grad_norm": 0.47303131222724915, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 3330 + }, + { + "epoch": 2.4953305939484496, + "grad_norm": 0.4522389769554138, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 3340 + }, + { + "epoch": 2.50280164363093, + "grad_norm": 0.4467332363128662, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3350 + }, + { + "epoch": 2.5102726933134107, + "grad_norm": 0.4413762092590332, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3360 + }, + { + "epoch": 2.517743742995891, + "grad_norm": 0.495514452457428, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 3370 + }, + { + "epoch": 2.5252147926783715, + "grad_norm": 0.4429773986339569, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3380 + }, + { + "epoch": 2.5326858423608516, + "grad_norm": 0.4589079022407532, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3390 + }, + { + "epoch": 2.540156892043332, + "grad_norm": 0.4683997333049774, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 3400 + }, + { + "epoch": 2.5476279417258123, + "grad_norm": 0.4651731252670288, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 3410 + }, + { + "epoch": 2.555098991408293, + "grad_norm": 0.45818084478378296, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3420 + }, + { + "epoch": 2.5625700410907735, + "grad_norm": 0.45209529995918274, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.5700410907732536, + "grad_norm": 0.4344733655452728, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3440 + }, + { + "epoch": 2.577512140455734, + "grad_norm": 0.47435566782951355, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 3450 + }, + { + "epoch": 2.5849831901382143, + "grad_norm": 0.43841999769210815, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3460 + }, + { + "epoch": 2.592454239820695, + "grad_norm": 0.4323869049549103, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 3470 + }, + { + "epoch": 2.599925289503175, + "grad_norm": 0.44355881214141846, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 3480 + }, + { + "epoch": 2.6073963391856556, + "grad_norm": 0.45847779512405396, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 3490 + }, + { + "epoch": 2.614867388868136, + "grad_norm": 0.4411061704158783, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3500 + }, + { + "epoch": 2.6223384385506163, + "grad_norm": 0.4446796178817749, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3510 + }, + { + "epoch": 2.629809488233097, + "grad_norm": 0.41969653964042664, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3520 + }, + { + "epoch": 2.637280537915577, + "grad_norm": 0.5263747572898865, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 3530 + }, + { + "epoch": 2.6447515875980576, + "grad_norm": 0.47719451785087585, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3540 + }, + { + "epoch": 2.6522226372805378, + "grad_norm": 0.46574118733406067, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 3550 + }, + { + "epoch": 2.6596936869630183, + "grad_norm": 0.46867135167121887, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 3560 + }, + { + "epoch": 2.667164736645499, + "grad_norm": 0.4441198706626892, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3570 + }, + { + "epoch": 2.674635786327979, + "grad_norm": 0.4871319830417633, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3580 + }, + { + "epoch": 2.6821068360104596, + "grad_norm": 0.43900373578071594, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 3590 + }, + { + "epoch": 2.6895778856929398, + "grad_norm": 0.42509549856185913, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 3600 + }, + { + "epoch": 2.6970489353754203, + "grad_norm": 0.4691086709499359, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3610 + }, + { + "epoch": 2.7045199850579005, + "grad_norm": 0.46318942308425903, + "learning_rate": 0.0002, + "loss": 1.5491, + "step": 3620 + }, + { + "epoch": 2.711991034740381, + "grad_norm": 0.44631096720695496, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3630 + }, + { + "epoch": 2.7194620844228616, + "grad_norm": 0.42315489053726196, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3640 + }, + { + "epoch": 2.7269331341053418, + "grad_norm": 0.4971241056919098, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3650 + }, + { + "epoch": 2.7344041837878224, + "grad_norm": 0.4578486382961273, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 3660 + }, + { + "epoch": 2.7418752334703025, + "grad_norm": 0.46584776043891907, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3670 + }, + { + "epoch": 2.749346283152783, + "grad_norm": 0.4951731264591217, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 3680 + }, + { + "epoch": 2.756817332835263, + "grad_norm": 0.4935225546360016, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3690 + }, + { + "epoch": 2.764288382517744, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3700 + }, + { + "epoch": 2.7717594322002244, + "grad_norm": 0.4417555630207062, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 3710 + }, + { + "epoch": 2.7792304818827045, + "grad_norm": 0.48229655623435974, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 3720 + }, + { + "epoch": 2.786701531565185, + "grad_norm": 0.48562315106391907, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3730 + }, + { + "epoch": 2.794172581247665, + "grad_norm": 0.4473940432071686, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3740 + }, + { + "epoch": 2.801643630930146, + "grad_norm": 0.4626813232898712, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3750 + }, + { + "epoch": 2.809114680612626, + "grad_norm": 0.4339792728424072, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3760 + }, + { + "epoch": 2.8165857302951065, + "grad_norm": 0.5250858068466187, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 3770 + }, + { + "epoch": 2.824056779977587, + "grad_norm": 0.4537523090839386, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3780 + }, + { + "epoch": 2.831527829660067, + "grad_norm": 0.5646113157272339, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3790 + }, + { + "epoch": 2.8389988793425474, + "grad_norm": 0.44243332743644714, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3800 + }, + { + "epoch": 2.846469929025028, + "grad_norm": 0.4585791826248169, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3810 + }, + { + "epoch": 2.8539409787075085, + "grad_norm": 0.489702045917511, + "learning_rate": 0.0002, + "loss": 1.6854, + "step": 3820 + }, + { + "epoch": 2.8614120283899886, + "grad_norm": 0.502470850944519, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 3830 + }, + { + "epoch": 2.8688830780724692, + "grad_norm": 0.4395960867404938, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3840 + }, + { + "epoch": 2.87635412775495, + "grad_norm": 0.4348670244216919, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3850 + }, + { + "epoch": 2.88382517743743, + "grad_norm": 0.48852720856666565, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3860 + }, + { + "epoch": 2.89129622711991, + "grad_norm": 0.45317450165748596, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3870 + }, + { + "epoch": 2.8987672768023907, + "grad_norm": 0.4732758700847626, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3880 + }, + { + "epoch": 2.9062383264848712, + "grad_norm": 0.45238012075424194, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3890 + }, + { + "epoch": 2.9137093761673514, + "grad_norm": 0.48838064074516296, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3900 + }, + { + "epoch": 2.921180425849832, + "grad_norm": 0.43496349453926086, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 3910 + }, + { + "epoch": 2.9286514755323125, + "grad_norm": 0.47963935136795044, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 3920 + }, + { + "epoch": 2.9361225252147927, + "grad_norm": 0.4544987976551056, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 3930 + }, + { + "epoch": 2.943593574897273, + "grad_norm": 0.4622892141342163, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3940 + }, + { + "epoch": 2.9510646245797534, + "grad_norm": 0.47026222944259644, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 3950 + }, + { + "epoch": 2.958535674262234, + "grad_norm": 0.4549552798271179, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 3960 + }, + { + "epoch": 2.966006723944714, + "grad_norm": 0.46647515892982483, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3970 + }, + { + "epoch": 2.9734777736271947, + "grad_norm": 0.45095112919807434, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3980 + }, + { + "epoch": 2.9809488233096753, + "grad_norm": 0.4690017104148865, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 3990 + }, + { + "epoch": 2.9884198729921554, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 4000 + }, + { + "epoch": 2.9958909226746355, + "grad_norm": 0.4743294417858124, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4010 + }, + { + "epoch": 2.999626447515876, + "eval_loss": 1.8252571821212769, + "eval_runtime": 38.7853, + "eval_samples_per_second": 13.278, + "eval_steps_per_second": 1.676, + "step": 4015 + }, + { + "epoch": 3.003361972357116, + "grad_norm": 0.4919724464416504, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 4020 + }, + { + "epoch": 3.0108330220395967, + "grad_norm": 0.4747185707092285, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4030 + }, + { + "epoch": 3.018304071722077, + "grad_norm": 0.4797595143318176, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 4040 + }, + { + "epoch": 3.0257751214045574, + "grad_norm": 0.5450999140739441, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 4050 + }, + { + "epoch": 3.0332461710870375, + "grad_norm": 0.49058812856674194, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4060 + }, + { + "epoch": 3.040717220769518, + "grad_norm": 0.5219563841819763, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4070 + }, + { + "epoch": 3.0481882704519987, + "grad_norm": 0.515628457069397, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 4080 + }, + { + "epoch": 3.055659320134479, + "grad_norm": 0.6145984530448914, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 4090 + }, + { + "epoch": 3.0631303698169594, + "grad_norm": 0.6067144274711609, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 4100 + }, + { + "epoch": 3.0706014194994395, + "grad_norm": 0.5773133039474487, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4110 + }, + { + "epoch": 3.07807246918192, + "grad_norm": 0.6894241571426392, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 4120 + }, + { + "epoch": 3.0855435188644003, + "grad_norm": 0.6422514915466309, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4130 + }, + { + "epoch": 3.093014568546881, + "grad_norm": 0.6119855046272278, + "learning_rate": 0.0002, + "loss": 1.4724, + "step": 4140 + }, + { + "epoch": 3.1004856182293614, + "grad_norm": 0.5847280025482178, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 4150 + }, + { + "epoch": 3.1079566679118416, + "grad_norm": 0.5401515960693359, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4160 + }, + { + "epoch": 3.115427717594322, + "grad_norm": 0.6501587629318237, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 4170 + }, + { + "epoch": 3.1228987672768023, + "grad_norm": 0.5988039374351501, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 4180 + }, + { + "epoch": 3.130369816959283, + "grad_norm": 0.4982665181159973, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 4190 + }, + { + "epoch": 3.137840866641763, + "grad_norm": 0.5548039078712463, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 4200 + }, + { + "epoch": 3.1453119163242436, + "grad_norm": 0.5920777320861816, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 4210 + }, + { + "epoch": 3.152782966006724, + "grad_norm": 0.6965190172195435, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 4220 + }, + { + "epoch": 3.1602540156892043, + "grad_norm": 0.5196244716644287, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4230 + }, + { + "epoch": 3.167725065371685, + "grad_norm": 0.6942682266235352, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4240 + }, + { + "epoch": 3.175196115054165, + "grad_norm": 0.5765156149864197, + "learning_rate": 0.0002, + "loss": 1.5407, + "step": 4250 + }, + { + "epoch": 3.1826671647366456, + "grad_norm": 0.5801976919174194, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 4260 + }, + { + "epoch": 3.1901382144191257, + "grad_norm": 0.6260752081871033, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4270 + }, + { + "epoch": 3.1976092641016063, + "grad_norm": 0.6610770225524902, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 4280 + }, + { + "epoch": 3.205080313784087, + "grad_norm": 0.5762143135070801, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 4290 + }, + { + "epoch": 3.212551363466567, + "grad_norm": 0.5926990509033203, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 4300 + }, + { + "epoch": 3.2200224131490476, + "grad_norm": 0.7373854517936707, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 4310 + }, + { + "epoch": 3.2274934628315277, + "grad_norm": 0.5963311195373535, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 4320 + }, + { + "epoch": 3.2349645125140083, + "grad_norm": 0.5754616856575012, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 4330 + }, + { + "epoch": 3.2424355621964884, + "grad_norm": 0.6116095781326294, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 4340 + }, + { + "epoch": 3.249906611878969, + "grad_norm": 0.6001536846160889, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 4350 + }, + { + "epoch": 3.257377661561449, + "grad_norm": 0.5270227789878845, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 4360 + }, + { + "epoch": 3.2648487112439297, + "grad_norm": 0.6666602492332458, + "learning_rate": 0.0002, + "loss": 1.5235, + "step": 4370 + }, + { + "epoch": 3.2723197609264103, + "grad_norm": 0.520310640335083, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4380 + }, + { + "epoch": 3.2797908106088904, + "grad_norm": 0.5165975093841553, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 4390 + }, + { + "epoch": 3.287261860291371, + "grad_norm": 0.6080228686332703, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4400 + }, + { + "epoch": 3.294732909973851, + "grad_norm": 0.670122504234314, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 4410 + }, + { + "epoch": 3.3022039596563317, + "grad_norm": 0.6019457578659058, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 4420 + }, + { + "epoch": 3.309675009338812, + "grad_norm": 0.5519300103187561, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 4430 + }, + { + "epoch": 3.3171460590212924, + "grad_norm": 0.5958521962165833, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 4440 + }, + { + "epoch": 3.324617108703773, + "grad_norm": 0.5552705526351929, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4450 + }, + { + "epoch": 3.332088158386253, + "grad_norm": 0.6583784818649292, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4460 + }, + { + "epoch": 3.3395592080687337, + "grad_norm": 0.5815939903259277, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4470 + }, + { + "epoch": 3.347030257751214, + "grad_norm": 1.3342205286026, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4480 + }, + { + "epoch": 3.3545013074336945, + "grad_norm": 0.6341500878334045, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 4490 + }, + { + "epoch": 3.3619723571161746, + "grad_norm": 0.6384079456329346, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 4500 + }, + { + "epoch": 3.369443406798655, + "grad_norm": 0.6098346710205078, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 4510 + }, + { + "epoch": 3.3769144564811358, + "grad_norm": 0.5958296656608582, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4520 + }, + { + "epoch": 3.384385506163616, + "grad_norm": 0.6157881617546082, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 4530 + }, + { + "epoch": 3.3918565558460965, + "grad_norm": 0.5671007037162781, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 4540 + }, + { + "epoch": 3.3993276055285766, + "grad_norm": 0.6203294992446899, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 4550 + }, + { + "epoch": 3.406798655211057, + "grad_norm": 0.6743317246437073, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 4560 + }, + { + "epoch": 3.4142697048935373, + "grad_norm": 0.731765627861023, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4570 + }, + { + "epoch": 3.421740754576018, + "grad_norm": 0.6285187602043152, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 4580 + }, + { + "epoch": 3.4292118042584985, + "grad_norm": 0.612680196762085, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 4590 + }, + { + "epoch": 3.4366828539409786, + "grad_norm": 0.6413681507110596, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 4600 + }, + { + "epoch": 3.444153903623459, + "grad_norm": 0.6240990161895752, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4610 + }, + { + "epoch": 3.4516249533059393, + "grad_norm": 0.5095735192298889, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4620 + }, + { + "epoch": 3.45909600298842, + "grad_norm": 0.5699611902236938, + "learning_rate": 0.0002, + "loss": 1.4906, + "step": 4630 + }, + { + "epoch": 3.4665670526709, + "grad_norm": 0.7289775609970093, + "learning_rate": 0.0002, + "loss": 1.5176, + "step": 4640 + }, + { + "epoch": 3.4740381023533806, + "grad_norm": 0.6211609840393066, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 4650 + }, + { + "epoch": 3.481509152035861, + "grad_norm": 0.5714802145957947, + "learning_rate": 0.0002, + "loss": 1.533, + "step": 4660 + }, + { + "epoch": 3.4889802017183413, + "grad_norm": 0.6287049651145935, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 4670 + }, + { + "epoch": 3.496451251400822, + "grad_norm": 0.5480595827102661, + "learning_rate": 0.0002, + "loss": 1.4212, + "step": 4680 + }, + { + "epoch": 3.503922301083302, + "grad_norm": 0.5683253407478333, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4690 + }, + { + "epoch": 3.5113933507657826, + "grad_norm": 0.601140558719635, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4700 + }, + { + "epoch": 3.5188644004482628, + "grad_norm": 0.5344498157501221, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 4710 + }, + { + "epoch": 3.5263354501307433, + "grad_norm": 0.5739690661430359, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4720 + }, + { + "epoch": 3.533806499813224, + "grad_norm": 0.5640085935592651, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 4730 + }, + { + "epoch": 3.541277549495704, + "grad_norm": 0.5967805981636047, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 4740 + }, + { + "epoch": 3.5487485991781846, + "grad_norm": 0.6138835549354553, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4750 + }, + { + "epoch": 3.5562196488606648, + "grad_norm": 0.6779900193214417, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 4760 + }, + { + "epoch": 3.5636906985431454, + "grad_norm": 0.6122010350227356, + "learning_rate": 0.0002, + "loss": 1.4917, + "step": 4770 + }, + { + "epoch": 3.5711617482256255, + "grad_norm": 0.5685241222381592, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4780 + }, + { + "epoch": 3.578632797908106, + "grad_norm": 0.604583203792572, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 4790 + }, + { + "epoch": 3.5861038475905866, + "grad_norm": 0.651165246963501, + "learning_rate": 0.0002, + "loss": 1.4514, + "step": 4800 + }, + { + "epoch": 3.593574897273067, + "grad_norm": 0.6398511528968811, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 4810 + }, + { + "epoch": 3.6010459469555474, + "grad_norm": 0.6444641351699829, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4820 + }, + { + "epoch": 3.6085169966380275, + "grad_norm": 0.6018481850624084, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 4830 + }, + { + "epoch": 3.615988046320508, + "grad_norm": 0.6025291085243225, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 4840 + }, + { + "epoch": 3.623459096002988, + "grad_norm": 0.6810156106948853, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4850 + }, + { + "epoch": 3.630930145685469, + "grad_norm": 0.6408044695854187, + "learning_rate": 0.0002, + "loss": 1.5299, + "step": 4860 + }, + { + "epoch": 3.6384011953679494, + "grad_norm": 0.5608272552490234, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4870 + }, + { + "epoch": 3.6458722450504295, + "grad_norm": 0.6136814951896667, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 4880 + }, + { + "epoch": 3.65334329473291, + "grad_norm": 0.5927900075912476, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4890 + }, + { + "epoch": 3.66081434441539, + "grad_norm": 0.5336901545524597, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 4900 + }, + { + "epoch": 3.668285394097871, + "grad_norm": 0.7823320627212524, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 4910 + }, + { + "epoch": 3.675756443780351, + "grad_norm": 0.6703504323959351, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 4920 + }, + { + "epoch": 3.6832274934628315, + "grad_norm": 0.6061160564422607, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 4930 + }, + { + "epoch": 3.690698543145312, + "grad_norm": 0.6237227916717529, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4940 + }, + { + "epoch": 3.6981695928277922, + "grad_norm": 0.5985278487205505, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 4950 + }, + { + "epoch": 3.705640642510273, + "grad_norm": 0.6483839750289917, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 4960 + }, + { + "epoch": 3.713111692192753, + "grad_norm": 0.5788805484771729, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 4970 + }, + { + "epoch": 3.7205827418752335, + "grad_norm": 0.5609974265098572, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 4980 + }, + { + "epoch": 3.7280537915577137, + "grad_norm": 0.5681300759315491, + "learning_rate": 0.0002, + "loss": 1.4759, + "step": 4990 + }, + { + "epoch": 3.7355248412401942, + "grad_norm": 0.5860186219215393, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 5000 + }, + { + "epoch": 3.742995890922675, + "grad_norm": 0.5718157291412354, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 5010 + }, + { + "epoch": 3.750466940605155, + "grad_norm": 0.6173721551895142, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 5020 + }, + { + "epoch": 3.7579379902876355, + "grad_norm": 0.629152238368988, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 5030 + }, + { + "epoch": 3.7654090399701157, + "grad_norm": 0.5666284561157227, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 5040 + }, + { + "epoch": 3.7728800896525962, + "grad_norm": 0.6053005456924438, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5050 + }, + { + "epoch": 3.7803511393350764, + "grad_norm": 0.5870583057403564, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 5060 + }, + { + "epoch": 3.787822189017557, + "grad_norm": 0.5422009229660034, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5070 + }, + { + "epoch": 3.7952932387000375, + "grad_norm": 0.5396918058395386, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 5080 + }, + { + "epoch": 3.8027642883825177, + "grad_norm": 0.5544713139533997, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 5090 + }, + { + "epoch": 3.8102353380649983, + "grad_norm": 0.5983749628067017, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5100 + }, + { + "epoch": 3.8177063877474784, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 5110 + }, + { + "epoch": 3.825177437429959, + "grad_norm": 0.5436882376670837, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 5120 + }, + { + "epoch": 3.832648487112439, + "grad_norm": 0.5453617572784424, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 5130 + }, + { + "epoch": 3.8401195367949197, + "grad_norm": 0.6269069314002991, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 5140 + }, + { + "epoch": 3.8475905864774003, + "grad_norm": 0.6189185380935669, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 5150 + }, + { + "epoch": 3.8550616361598804, + "grad_norm": 0.6653388142585754, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 5160 + }, + { + "epoch": 3.862532685842361, + "grad_norm": 0.5771768689155579, + "learning_rate": 0.0002, + "loss": 1.5075, + "step": 5170 + }, + { + "epoch": 3.870003735524841, + "grad_norm": 0.6052790880203247, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5180 + }, + { + "epoch": 3.8774747852073217, + "grad_norm": 0.6572316884994507, + "learning_rate": 0.0002, + "loss": 1.4987, + "step": 5190 + }, + { + "epoch": 3.884945834889802, + "grad_norm": 0.670576810836792, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 5200 + }, + { + "epoch": 3.8924168845722824, + "grad_norm": 0.5728798508644104, + "learning_rate": 0.0002, + "loss": 1.4777, + "step": 5210 + }, + { + "epoch": 3.899887934254763, + "grad_norm": 0.6340774297714233, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 5220 + }, + { + "epoch": 3.907358983937243, + "grad_norm": 0.5981315970420837, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 5230 + }, + { + "epoch": 3.9148300336197237, + "grad_norm": 0.6212025880813599, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 5240 + }, + { + "epoch": 3.922301083302204, + "grad_norm": 0.6202296018600464, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5250 + }, + { + "epoch": 3.9297721329846844, + "grad_norm": 0.6159142255783081, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 5260 + }, + { + "epoch": 3.9372431826671646, + "grad_norm": 0.6519438624382019, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 5270 + }, + { + "epoch": 3.944714232349645, + "grad_norm": 0.539813756942749, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5280 + }, + { + "epoch": 3.9521852820321257, + "grad_norm": 0.6443665027618408, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 5290 + }, + { + "epoch": 3.959656331714606, + "grad_norm": 0.6635757684707642, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 5300 + }, + { + "epoch": 3.9671273813970864, + "grad_norm": 0.589363157749176, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 5310 + }, + { + "epoch": 3.9745984310795666, + "grad_norm": 0.5788735747337341, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 5320 + }, + { + "epoch": 3.982069480762047, + "grad_norm": 0.5976864695549011, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 5330 + }, + { + "epoch": 3.9895405304445273, + "grad_norm": 0.6624067425727844, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 5340 + }, + { + "epoch": 3.997011580127008, + "grad_norm": 0.6738956570625305, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 5350 + }, + { + "epoch": 4.0, + "eval_loss": 1.868006944656372, + "eval_runtime": 38.5153, + "eval_samples_per_second": 13.371, + "eval_steps_per_second": 1.688, + "step": 5354 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4777113190845645e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-5354/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b524961a7cacdf0b8dad16e7f55191ca2cbf84b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf78460aaba2dfe8b0336cc414547b7bc27589452c9b9f4ca884bdc52f50bc9b +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea8f8db4feb4d3342d8ba1d777f602eb342a958 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:273d385eaf26e07c5a4d9fce01ecf1c9e52ba8ae5bf7ab1de30a91ed640e217d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7dbddeba79e26dee6931b482dbf74d3fb6512ceb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08434e14019b3642e7ee20049bf0db48349e255ee7fadc6ceced85de3293e5f +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f570954cd0dd056cda1acf31c975a205e6074667 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c060c2039ed35dedb25f4237d17aaa81972fd44cf51b001f2fb57693fb37dd +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0478ceb5cf64acd9d9e34e9143bef5bd316acbc4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/trainer_state.json @@ -0,0 +1,4756 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 4.999626447515876, + "eval_steps": 10, + "global_step": 6692, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + }, + { + "epoch": 2.002241314904744, + "grad_norm": 0.36817631125450134, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 2680 + }, + { + "epoch": 2.0097123645872244, + "grad_norm": 0.4056456685066223, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2690 + }, + { + "epoch": 2.017183414269705, + "grad_norm": 0.37416863441467285, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 2700 + }, + { + "epoch": 2.024654463952185, + "grad_norm": 0.4273638427257538, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2710 + }, + { + "epoch": 2.0321255136346656, + "grad_norm": 0.36497923731803894, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2720 + }, + { + "epoch": 2.0395965633171462, + "grad_norm": 0.5021994113922119, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 2730 + }, + { + "epoch": 2.0470676129996264, + "grad_norm": 0.45896220207214355, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 2740 + }, + { + "epoch": 2.054538662682107, + "grad_norm": 0.3973815143108368, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2750 + }, + { + "epoch": 2.062009712364587, + "grad_norm": 0.4521815776824951, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2760 + }, + { + "epoch": 2.0694807620470677, + "grad_norm": 0.42775002121925354, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2770 + }, + { + "epoch": 2.076951811729548, + "grad_norm": 0.48158586025238037, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 2780 + }, + { + "epoch": 2.0844228614120284, + "grad_norm": 0.4612371623516083, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2790 + }, + { + "epoch": 2.091893911094509, + "grad_norm": 0.42536866664886475, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 2800 + }, + { + "epoch": 2.099364960776989, + "grad_norm": 0.48515772819519043, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 2810 + }, + { + "epoch": 2.1068360104594697, + "grad_norm": 0.41418662667274475, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2820 + }, + { + "epoch": 2.11430706014195, + "grad_norm": 0.4683697819709778, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2830 + }, + { + "epoch": 2.1217781098244304, + "grad_norm": 0.4484657049179077, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2840 + }, + { + "epoch": 2.1292491595069105, + "grad_norm": 0.6621400713920593, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 2850 + }, + { + "epoch": 2.136720209189391, + "grad_norm": 0.45074811577796936, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2860 + }, + { + "epoch": 2.1441912588718717, + "grad_norm": 0.3513113558292389, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2870 + }, + { + "epoch": 2.151662308554352, + "grad_norm": 0.40411314368247986, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2880 + }, + { + "epoch": 2.1591333582368324, + "grad_norm": 0.4121065139770508, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 2890 + }, + { + "epoch": 2.1666044079193125, + "grad_norm": 0.44723689556121826, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 2900 + }, + { + "epoch": 2.174075457601793, + "grad_norm": 0.4226122498512268, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2910 + }, + { + "epoch": 2.1815465072842732, + "grad_norm": 0.46617650985717773, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2920 + }, + { + "epoch": 2.189017556966754, + "grad_norm": 0.4506422281265259, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 2930 + }, + { + "epoch": 2.1964886066492344, + "grad_norm": 0.4892672896385193, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2940 + }, + { + "epoch": 2.2039596563317145, + "grad_norm": 0.44095516204833984, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2950 + }, + { + "epoch": 2.211430706014195, + "grad_norm": 0.41522109508514404, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2960 + }, + { + "epoch": 2.2189017556966752, + "grad_norm": 0.4860858917236328, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2970 + }, + { + "epoch": 2.226372805379156, + "grad_norm": 0.42662516236305237, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2980 + }, + { + "epoch": 2.233843855061636, + "grad_norm": 0.4390648305416107, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2990 + }, + { + "epoch": 2.2413149047441165, + "grad_norm": 0.47515565156936646, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3000 + }, + { + "epoch": 2.248785954426597, + "grad_norm": 0.4104543924331665, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 3010 + }, + { + "epoch": 2.2562570041090773, + "grad_norm": 0.4404028654098511, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 3020 + }, + { + "epoch": 2.263728053791558, + "grad_norm": 0.4717366695404053, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3030 + }, + { + "epoch": 2.271199103474038, + "grad_norm": 0.48345857858657837, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 3040 + }, + { + "epoch": 2.2786701531565186, + "grad_norm": 0.5312452912330627, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 3050 + }, + { + "epoch": 2.2861412028389987, + "grad_norm": 0.5073099732398987, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3060 + }, + { + "epoch": 2.2936122525214793, + "grad_norm": 0.5027463436126709, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 3070 + }, + { + "epoch": 2.30108330220396, + "grad_norm": 0.5436304807662964, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3080 + }, + { + "epoch": 2.30855435188644, + "grad_norm": 0.4701065123081207, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3090 + }, + { + "epoch": 2.3160254015689206, + "grad_norm": 0.46988746523857117, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 3100 + }, + { + "epoch": 2.3234964512514007, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 3110 + }, + { + "epoch": 2.3309675009338813, + "grad_norm": 0.5173566937446594, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 3120 + }, + { + "epoch": 2.3384385506163614, + "grad_norm": 0.40345850586891174, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 3130 + }, + { + "epoch": 2.345909600298842, + "grad_norm": 0.4218924939632416, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3140 + }, + { + "epoch": 2.3533806499813226, + "grad_norm": 0.41857317090034485, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 3150 + }, + { + "epoch": 2.3608516996638027, + "grad_norm": 0.4197218418121338, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3160 + }, + { + "epoch": 2.3683227493462833, + "grad_norm": 0.4260677397251129, + "learning_rate": 0.0002, + "loss": 1.6572, + "step": 3170 + }, + { + "epoch": 2.3757937990287634, + "grad_norm": 0.4209042191505432, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3180 + }, + { + "epoch": 2.383264848711244, + "grad_norm": 0.4092234969139099, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3190 + }, + { + "epoch": 2.390735898393724, + "grad_norm": 0.4928431510925293, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 3200 + }, + { + "epoch": 2.3982069480762047, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3210 + }, + { + "epoch": 2.4056779977586853, + "grad_norm": 0.4368397295475006, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3220 + }, + { + "epoch": 2.4131490474411654, + "grad_norm": 0.46122390031814575, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 3230 + }, + { + "epoch": 2.420620097123646, + "grad_norm": 0.4272301197052002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.428091146806126, + "grad_norm": 0.41480937600135803, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3250 + }, + { + "epoch": 2.4355621964886067, + "grad_norm": 0.48911941051483154, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 3260 + }, + { + "epoch": 2.443033246171087, + "grad_norm": 0.4444098472595215, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 3270 + }, + { + "epoch": 2.4505042958535674, + "grad_norm": 0.5111684799194336, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 3280 + }, + { + "epoch": 2.457975345536048, + "grad_norm": 0.5058825016021729, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 3290 + }, + { + "epoch": 2.465446395218528, + "grad_norm": 0.44173210859298706, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3300 + }, + { + "epoch": 2.4729174449010087, + "grad_norm": 0.4659745991230011, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 3310 + }, + { + "epoch": 2.480388494583489, + "grad_norm": 0.47237497568130493, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3320 + }, + { + "epoch": 2.4878595442659694, + "grad_norm": 0.47303131222724915, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 3330 + }, + { + "epoch": 2.4953305939484496, + "grad_norm": 0.4522389769554138, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 3340 + }, + { + "epoch": 2.50280164363093, + "grad_norm": 0.4467332363128662, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3350 + }, + { + "epoch": 2.5102726933134107, + "grad_norm": 0.4413762092590332, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3360 + }, + { + "epoch": 2.517743742995891, + "grad_norm": 0.495514452457428, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 3370 + }, + { + "epoch": 2.5252147926783715, + "grad_norm": 0.4429773986339569, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3380 + }, + { + "epoch": 2.5326858423608516, + "grad_norm": 0.4589079022407532, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3390 + }, + { + "epoch": 2.540156892043332, + "grad_norm": 0.4683997333049774, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 3400 + }, + { + "epoch": 2.5476279417258123, + "grad_norm": 0.4651731252670288, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 3410 + }, + { + "epoch": 2.555098991408293, + "grad_norm": 0.45818084478378296, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3420 + }, + { + "epoch": 2.5625700410907735, + "grad_norm": 0.45209529995918274, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.5700410907732536, + "grad_norm": 0.4344733655452728, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3440 + }, + { + "epoch": 2.577512140455734, + "grad_norm": 0.47435566782951355, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 3450 + }, + { + "epoch": 2.5849831901382143, + "grad_norm": 0.43841999769210815, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3460 + }, + { + "epoch": 2.592454239820695, + "grad_norm": 0.4323869049549103, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 3470 + }, + { + "epoch": 2.599925289503175, + "grad_norm": 0.44355881214141846, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 3480 + }, + { + "epoch": 2.6073963391856556, + "grad_norm": 0.45847779512405396, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 3490 + }, + { + "epoch": 2.614867388868136, + "grad_norm": 0.4411061704158783, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3500 + }, + { + "epoch": 2.6223384385506163, + "grad_norm": 0.4446796178817749, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3510 + }, + { + "epoch": 2.629809488233097, + "grad_norm": 0.41969653964042664, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3520 + }, + { + "epoch": 2.637280537915577, + "grad_norm": 0.5263747572898865, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 3530 + }, + { + "epoch": 2.6447515875980576, + "grad_norm": 0.47719451785087585, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3540 + }, + { + "epoch": 2.6522226372805378, + "grad_norm": 0.46574118733406067, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 3550 + }, + { + "epoch": 2.6596936869630183, + "grad_norm": 0.46867135167121887, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 3560 + }, + { + "epoch": 2.667164736645499, + "grad_norm": 0.4441198706626892, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3570 + }, + { + "epoch": 2.674635786327979, + "grad_norm": 0.4871319830417633, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3580 + }, + { + "epoch": 2.6821068360104596, + "grad_norm": 0.43900373578071594, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 3590 + }, + { + "epoch": 2.6895778856929398, + "grad_norm": 0.42509549856185913, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 3600 + }, + { + "epoch": 2.6970489353754203, + "grad_norm": 0.4691086709499359, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3610 + }, + { + "epoch": 2.7045199850579005, + "grad_norm": 0.46318942308425903, + "learning_rate": 0.0002, + "loss": 1.5491, + "step": 3620 + }, + { + "epoch": 2.711991034740381, + "grad_norm": 0.44631096720695496, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3630 + }, + { + "epoch": 2.7194620844228616, + "grad_norm": 0.42315489053726196, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3640 + }, + { + "epoch": 2.7269331341053418, + "grad_norm": 0.4971241056919098, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3650 + }, + { + "epoch": 2.7344041837878224, + "grad_norm": 0.4578486382961273, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 3660 + }, + { + "epoch": 2.7418752334703025, + "grad_norm": 0.46584776043891907, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3670 + }, + { + "epoch": 2.749346283152783, + "grad_norm": 0.4951731264591217, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 3680 + }, + { + "epoch": 2.756817332835263, + "grad_norm": 0.4935225546360016, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3690 + }, + { + "epoch": 2.764288382517744, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3700 + }, + { + "epoch": 2.7717594322002244, + "grad_norm": 0.4417555630207062, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 3710 + }, + { + "epoch": 2.7792304818827045, + "grad_norm": 0.48229655623435974, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 3720 + }, + { + "epoch": 2.786701531565185, + "grad_norm": 0.48562315106391907, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3730 + }, + { + "epoch": 2.794172581247665, + "grad_norm": 0.4473940432071686, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3740 + }, + { + "epoch": 2.801643630930146, + "grad_norm": 0.4626813232898712, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3750 + }, + { + "epoch": 2.809114680612626, + "grad_norm": 0.4339792728424072, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3760 + }, + { + "epoch": 2.8165857302951065, + "grad_norm": 0.5250858068466187, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 3770 + }, + { + "epoch": 2.824056779977587, + "grad_norm": 0.4537523090839386, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3780 + }, + { + "epoch": 2.831527829660067, + "grad_norm": 0.5646113157272339, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3790 + }, + { + "epoch": 2.8389988793425474, + "grad_norm": 0.44243332743644714, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3800 + }, + { + "epoch": 2.846469929025028, + "grad_norm": 0.4585791826248169, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3810 + }, + { + "epoch": 2.8539409787075085, + "grad_norm": 0.489702045917511, + "learning_rate": 0.0002, + "loss": 1.6854, + "step": 3820 + }, + { + "epoch": 2.8614120283899886, + "grad_norm": 0.502470850944519, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 3830 + }, + { + "epoch": 2.8688830780724692, + "grad_norm": 0.4395960867404938, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3840 + }, + { + "epoch": 2.87635412775495, + "grad_norm": 0.4348670244216919, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3850 + }, + { + "epoch": 2.88382517743743, + "grad_norm": 0.48852720856666565, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3860 + }, + { + "epoch": 2.89129622711991, + "grad_norm": 0.45317450165748596, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3870 + }, + { + "epoch": 2.8987672768023907, + "grad_norm": 0.4732758700847626, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3880 + }, + { + "epoch": 2.9062383264848712, + "grad_norm": 0.45238012075424194, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3890 + }, + { + "epoch": 2.9137093761673514, + "grad_norm": 0.48838064074516296, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3900 + }, + { + "epoch": 2.921180425849832, + "grad_norm": 0.43496349453926086, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 3910 + }, + { + "epoch": 2.9286514755323125, + "grad_norm": 0.47963935136795044, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 3920 + }, + { + "epoch": 2.9361225252147927, + "grad_norm": 0.4544987976551056, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 3930 + }, + { + "epoch": 2.943593574897273, + "grad_norm": 0.4622892141342163, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3940 + }, + { + "epoch": 2.9510646245797534, + "grad_norm": 0.47026222944259644, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 3950 + }, + { + "epoch": 2.958535674262234, + "grad_norm": 0.4549552798271179, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 3960 + }, + { + "epoch": 2.966006723944714, + "grad_norm": 0.46647515892982483, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3970 + }, + { + "epoch": 2.9734777736271947, + "grad_norm": 0.45095112919807434, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3980 + }, + { + "epoch": 2.9809488233096753, + "grad_norm": 0.4690017104148865, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 3990 + }, + { + "epoch": 2.9884198729921554, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 4000 + }, + { + "epoch": 2.9958909226746355, + "grad_norm": 0.4743294417858124, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4010 + }, + { + "epoch": 2.999626447515876, + "eval_loss": 1.8252571821212769, + "eval_runtime": 38.7853, + "eval_samples_per_second": 13.278, + "eval_steps_per_second": 1.676, + "step": 4015 + }, + { + "epoch": 3.003361972357116, + "grad_norm": 0.4919724464416504, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 4020 + }, + { + "epoch": 3.0108330220395967, + "grad_norm": 0.4747185707092285, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4030 + }, + { + "epoch": 3.018304071722077, + "grad_norm": 0.4797595143318176, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 4040 + }, + { + "epoch": 3.0257751214045574, + "grad_norm": 0.5450999140739441, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 4050 + }, + { + "epoch": 3.0332461710870375, + "grad_norm": 0.49058812856674194, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4060 + }, + { + "epoch": 3.040717220769518, + "grad_norm": 0.5219563841819763, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4070 + }, + { + "epoch": 3.0481882704519987, + "grad_norm": 0.515628457069397, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 4080 + }, + { + "epoch": 3.055659320134479, + "grad_norm": 0.6145984530448914, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 4090 + }, + { + "epoch": 3.0631303698169594, + "grad_norm": 0.6067144274711609, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 4100 + }, + { + "epoch": 3.0706014194994395, + "grad_norm": 0.5773133039474487, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4110 + }, + { + "epoch": 3.07807246918192, + "grad_norm": 0.6894241571426392, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 4120 + }, + { + "epoch": 3.0855435188644003, + "grad_norm": 0.6422514915466309, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4130 + }, + { + "epoch": 3.093014568546881, + "grad_norm": 0.6119855046272278, + "learning_rate": 0.0002, + "loss": 1.4724, + "step": 4140 + }, + { + "epoch": 3.1004856182293614, + "grad_norm": 0.5847280025482178, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 4150 + }, + { + "epoch": 3.1079566679118416, + "grad_norm": 0.5401515960693359, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4160 + }, + { + "epoch": 3.115427717594322, + "grad_norm": 0.6501587629318237, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 4170 + }, + { + "epoch": 3.1228987672768023, + "grad_norm": 0.5988039374351501, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 4180 + }, + { + "epoch": 3.130369816959283, + "grad_norm": 0.4982665181159973, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 4190 + }, + { + "epoch": 3.137840866641763, + "grad_norm": 0.5548039078712463, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 4200 + }, + { + "epoch": 3.1453119163242436, + "grad_norm": 0.5920777320861816, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 4210 + }, + { + "epoch": 3.152782966006724, + "grad_norm": 0.6965190172195435, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 4220 + }, + { + "epoch": 3.1602540156892043, + "grad_norm": 0.5196244716644287, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4230 + }, + { + "epoch": 3.167725065371685, + "grad_norm": 0.6942682266235352, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4240 + }, + { + "epoch": 3.175196115054165, + "grad_norm": 0.5765156149864197, + "learning_rate": 0.0002, + "loss": 1.5407, + "step": 4250 + }, + { + "epoch": 3.1826671647366456, + "grad_norm": 0.5801976919174194, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 4260 + }, + { + "epoch": 3.1901382144191257, + "grad_norm": 0.6260752081871033, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4270 + }, + { + "epoch": 3.1976092641016063, + "grad_norm": 0.6610770225524902, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 4280 + }, + { + "epoch": 3.205080313784087, + "grad_norm": 0.5762143135070801, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 4290 + }, + { + "epoch": 3.212551363466567, + "grad_norm": 0.5926990509033203, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 4300 + }, + { + "epoch": 3.2200224131490476, + "grad_norm": 0.7373854517936707, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 4310 + }, + { + "epoch": 3.2274934628315277, + "grad_norm": 0.5963311195373535, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 4320 + }, + { + "epoch": 3.2349645125140083, + "grad_norm": 0.5754616856575012, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 4330 + }, + { + "epoch": 3.2424355621964884, + "grad_norm": 0.6116095781326294, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 4340 + }, + { + "epoch": 3.249906611878969, + "grad_norm": 0.6001536846160889, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 4350 + }, + { + "epoch": 3.257377661561449, + "grad_norm": 0.5270227789878845, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 4360 + }, + { + "epoch": 3.2648487112439297, + "grad_norm": 0.6666602492332458, + "learning_rate": 0.0002, + "loss": 1.5235, + "step": 4370 + }, + { + "epoch": 3.2723197609264103, + "grad_norm": 0.520310640335083, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4380 + }, + { + "epoch": 3.2797908106088904, + "grad_norm": 0.5165975093841553, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 4390 + }, + { + "epoch": 3.287261860291371, + "grad_norm": 0.6080228686332703, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4400 + }, + { + "epoch": 3.294732909973851, + "grad_norm": 0.670122504234314, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 4410 + }, + { + "epoch": 3.3022039596563317, + "grad_norm": 0.6019457578659058, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 4420 + }, + { + "epoch": 3.309675009338812, + "grad_norm": 0.5519300103187561, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 4430 + }, + { + "epoch": 3.3171460590212924, + "grad_norm": 0.5958521962165833, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 4440 + }, + { + "epoch": 3.324617108703773, + "grad_norm": 0.5552705526351929, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4450 + }, + { + "epoch": 3.332088158386253, + "grad_norm": 0.6583784818649292, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4460 + }, + { + "epoch": 3.3395592080687337, + "grad_norm": 0.5815939903259277, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4470 + }, + { + "epoch": 3.347030257751214, + "grad_norm": 1.3342205286026, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4480 + }, + { + "epoch": 3.3545013074336945, + "grad_norm": 0.6341500878334045, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 4490 + }, + { + "epoch": 3.3619723571161746, + "grad_norm": 0.6384079456329346, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 4500 + }, + { + "epoch": 3.369443406798655, + "grad_norm": 0.6098346710205078, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 4510 + }, + { + "epoch": 3.3769144564811358, + "grad_norm": 0.5958296656608582, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4520 + }, + { + "epoch": 3.384385506163616, + "grad_norm": 0.6157881617546082, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 4530 + }, + { + "epoch": 3.3918565558460965, + "grad_norm": 0.5671007037162781, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 4540 + }, + { + "epoch": 3.3993276055285766, + "grad_norm": 0.6203294992446899, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 4550 + }, + { + "epoch": 3.406798655211057, + "grad_norm": 0.6743317246437073, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 4560 + }, + { + "epoch": 3.4142697048935373, + "grad_norm": 0.731765627861023, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4570 + }, + { + "epoch": 3.421740754576018, + "grad_norm": 0.6285187602043152, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 4580 + }, + { + "epoch": 3.4292118042584985, + "grad_norm": 0.612680196762085, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 4590 + }, + { + "epoch": 3.4366828539409786, + "grad_norm": 0.6413681507110596, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 4600 + }, + { + "epoch": 3.444153903623459, + "grad_norm": 0.6240990161895752, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4610 + }, + { + "epoch": 3.4516249533059393, + "grad_norm": 0.5095735192298889, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4620 + }, + { + "epoch": 3.45909600298842, + "grad_norm": 0.5699611902236938, + "learning_rate": 0.0002, + "loss": 1.4906, + "step": 4630 + }, + { + "epoch": 3.4665670526709, + "grad_norm": 0.7289775609970093, + "learning_rate": 0.0002, + "loss": 1.5176, + "step": 4640 + }, + { + "epoch": 3.4740381023533806, + "grad_norm": 0.6211609840393066, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 4650 + }, + { + "epoch": 3.481509152035861, + "grad_norm": 0.5714802145957947, + "learning_rate": 0.0002, + "loss": 1.533, + "step": 4660 + }, + { + "epoch": 3.4889802017183413, + "grad_norm": 0.6287049651145935, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 4670 + }, + { + "epoch": 3.496451251400822, + "grad_norm": 0.5480595827102661, + "learning_rate": 0.0002, + "loss": 1.4212, + "step": 4680 + }, + { + "epoch": 3.503922301083302, + "grad_norm": 0.5683253407478333, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4690 + }, + { + "epoch": 3.5113933507657826, + "grad_norm": 0.601140558719635, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4700 + }, + { + "epoch": 3.5188644004482628, + "grad_norm": 0.5344498157501221, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 4710 + }, + { + "epoch": 3.5263354501307433, + "grad_norm": 0.5739690661430359, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4720 + }, + { + "epoch": 3.533806499813224, + "grad_norm": 0.5640085935592651, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 4730 + }, + { + "epoch": 3.541277549495704, + "grad_norm": 0.5967805981636047, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 4740 + }, + { + "epoch": 3.5487485991781846, + "grad_norm": 0.6138835549354553, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4750 + }, + { + "epoch": 3.5562196488606648, + "grad_norm": 0.6779900193214417, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 4760 + }, + { + "epoch": 3.5636906985431454, + "grad_norm": 0.6122010350227356, + "learning_rate": 0.0002, + "loss": 1.4917, + "step": 4770 + }, + { + "epoch": 3.5711617482256255, + "grad_norm": 0.5685241222381592, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4780 + }, + { + "epoch": 3.578632797908106, + "grad_norm": 0.604583203792572, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 4790 + }, + { + "epoch": 3.5861038475905866, + "grad_norm": 0.651165246963501, + "learning_rate": 0.0002, + "loss": 1.4514, + "step": 4800 + }, + { + "epoch": 3.593574897273067, + "grad_norm": 0.6398511528968811, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 4810 + }, + { + "epoch": 3.6010459469555474, + "grad_norm": 0.6444641351699829, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4820 + }, + { + "epoch": 3.6085169966380275, + "grad_norm": 0.6018481850624084, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 4830 + }, + { + "epoch": 3.615988046320508, + "grad_norm": 0.6025291085243225, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 4840 + }, + { + "epoch": 3.623459096002988, + "grad_norm": 0.6810156106948853, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4850 + }, + { + "epoch": 3.630930145685469, + "grad_norm": 0.6408044695854187, + "learning_rate": 0.0002, + "loss": 1.5299, + "step": 4860 + }, + { + "epoch": 3.6384011953679494, + "grad_norm": 0.5608272552490234, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4870 + }, + { + "epoch": 3.6458722450504295, + "grad_norm": 0.6136814951896667, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 4880 + }, + { + "epoch": 3.65334329473291, + "grad_norm": 0.5927900075912476, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4890 + }, + { + "epoch": 3.66081434441539, + "grad_norm": 0.5336901545524597, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 4900 + }, + { + "epoch": 3.668285394097871, + "grad_norm": 0.7823320627212524, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 4910 + }, + { + "epoch": 3.675756443780351, + "grad_norm": 0.6703504323959351, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 4920 + }, + { + "epoch": 3.6832274934628315, + "grad_norm": 0.6061160564422607, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 4930 + }, + { + "epoch": 3.690698543145312, + "grad_norm": 0.6237227916717529, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4940 + }, + { + "epoch": 3.6981695928277922, + "grad_norm": 0.5985278487205505, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 4950 + }, + { + "epoch": 3.705640642510273, + "grad_norm": 0.6483839750289917, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 4960 + }, + { + "epoch": 3.713111692192753, + "grad_norm": 0.5788805484771729, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 4970 + }, + { + "epoch": 3.7205827418752335, + "grad_norm": 0.5609974265098572, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 4980 + }, + { + "epoch": 3.7280537915577137, + "grad_norm": 0.5681300759315491, + "learning_rate": 0.0002, + "loss": 1.4759, + "step": 4990 + }, + { + "epoch": 3.7355248412401942, + "grad_norm": 0.5860186219215393, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 5000 + }, + { + "epoch": 3.742995890922675, + "grad_norm": 0.5718157291412354, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 5010 + }, + { + "epoch": 3.750466940605155, + "grad_norm": 0.6173721551895142, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 5020 + }, + { + "epoch": 3.7579379902876355, + "grad_norm": 0.629152238368988, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 5030 + }, + { + "epoch": 3.7654090399701157, + "grad_norm": 0.5666284561157227, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 5040 + }, + { + "epoch": 3.7728800896525962, + "grad_norm": 0.6053005456924438, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5050 + }, + { + "epoch": 3.7803511393350764, + "grad_norm": 0.5870583057403564, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 5060 + }, + { + "epoch": 3.787822189017557, + "grad_norm": 0.5422009229660034, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5070 + }, + { + "epoch": 3.7952932387000375, + "grad_norm": 0.5396918058395386, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 5080 + }, + { + "epoch": 3.8027642883825177, + "grad_norm": 0.5544713139533997, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 5090 + }, + { + "epoch": 3.8102353380649983, + "grad_norm": 0.5983749628067017, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5100 + }, + { + "epoch": 3.8177063877474784, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 5110 + }, + { + "epoch": 3.825177437429959, + "grad_norm": 0.5436882376670837, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 5120 + }, + { + "epoch": 3.832648487112439, + "grad_norm": 0.5453617572784424, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 5130 + }, + { + "epoch": 3.8401195367949197, + "grad_norm": 0.6269069314002991, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 5140 + }, + { + "epoch": 3.8475905864774003, + "grad_norm": 0.6189185380935669, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 5150 + }, + { + "epoch": 3.8550616361598804, + "grad_norm": 0.6653388142585754, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 5160 + }, + { + "epoch": 3.862532685842361, + "grad_norm": 0.5771768689155579, + "learning_rate": 0.0002, + "loss": 1.5075, + "step": 5170 + }, + { + "epoch": 3.870003735524841, + "grad_norm": 0.6052790880203247, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5180 + }, + { + "epoch": 3.8774747852073217, + "grad_norm": 0.6572316884994507, + "learning_rate": 0.0002, + "loss": 1.4987, + "step": 5190 + }, + { + "epoch": 3.884945834889802, + "grad_norm": 0.670576810836792, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 5200 + }, + { + "epoch": 3.8924168845722824, + "grad_norm": 0.5728798508644104, + "learning_rate": 0.0002, + "loss": 1.4777, + "step": 5210 + }, + { + "epoch": 3.899887934254763, + "grad_norm": 0.6340774297714233, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 5220 + }, + { + "epoch": 3.907358983937243, + "grad_norm": 0.5981315970420837, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 5230 + }, + { + "epoch": 3.9148300336197237, + "grad_norm": 0.6212025880813599, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 5240 + }, + { + "epoch": 3.922301083302204, + "grad_norm": 0.6202296018600464, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5250 + }, + { + "epoch": 3.9297721329846844, + "grad_norm": 0.6159142255783081, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 5260 + }, + { + "epoch": 3.9372431826671646, + "grad_norm": 0.6519438624382019, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 5270 + }, + { + "epoch": 3.944714232349645, + "grad_norm": 0.539813756942749, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5280 + }, + { + "epoch": 3.9521852820321257, + "grad_norm": 0.6443665027618408, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 5290 + }, + { + "epoch": 3.959656331714606, + "grad_norm": 0.6635757684707642, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 5300 + }, + { + "epoch": 3.9671273813970864, + "grad_norm": 0.589363157749176, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 5310 + }, + { + "epoch": 3.9745984310795666, + "grad_norm": 0.5788735747337341, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 5320 + }, + { + "epoch": 3.982069480762047, + "grad_norm": 0.5976864695549011, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 5330 + }, + { + "epoch": 3.9895405304445273, + "grad_norm": 0.6624067425727844, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 5340 + }, + { + "epoch": 3.997011580127008, + "grad_norm": 0.6738956570625305, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 5350 + }, + { + "epoch": 4.0, + "eval_loss": 1.868006944656372, + "eval_runtime": 38.5153, + "eval_samples_per_second": 13.371, + "eval_steps_per_second": 1.688, + "step": 5354 + }, + { + "epoch": 4.004482629809488, + "grad_norm": 0.6023468971252441, + "learning_rate": 0.0002, + "loss": 1.4535, + "step": 5360 + }, + { + "epoch": 4.011953679491969, + "grad_norm": 0.8589285612106323, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 5370 + }, + { + "epoch": 4.019424729174449, + "grad_norm": 0.7477491497993469, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5380 + }, + { + "epoch": 4.02689577885693, + "grad_norm": 0.7601922154426575, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 5390 + }, + { + "epoch": 4.03436682853941, + "grad_norm": 0.8115614056587219, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 5400 + }, + { + "epoch": 4.04183787822189, + "grad_norm": 0.669925332069397, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5410 + }, + { + "epoch": 4.04930892790437, + "grad_norm": 0.8091904520988464, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 5420 + }, + { + "epoch": 4.056779977586851, + "grad_norm": 0.709405779838562, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 5430 + }, + { + "epoch": 4.064251027269331, + "grad_norm": 1.0006179809570312, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 5440 + }, + { + "epoch": 4.071722076951811, + "grad_norm": 0.7017965912818909, + "learning_rate": 0.0002, + "loss": 1.3491, + "step": 5450 + }, + { + "epoch": 4.0791931266342925, + "grad_norm": 0.8991572260856628, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 5460 + }, + { + "epoch": 4.086664176316773, + "grad_norm": 0.9064797759056091, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5470 + }, + { + "epoch": 4.094135225999253, + "grad_norm": 0.7981749176979065, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5480 + }, + { + "epoch": 4.101606275681733, + "grad_norm": 0.7280883193016052, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 5490 + }, + { + "epoch": 4.109077325364214, + "grad_norm": 0.7419600486755371, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 5500 + }, + { + "epoch": 4.116548375046694, + "grad_norm": 0.8019949197769165, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 5510 + }, + { + "epoch": 4.124019424729174, + "grad_norm": 0.7501229047775269, + "learning_rate": 0.0002, + "loss": 1.3133, + "step": 5520 + }, + { + "epoch": 4.131490474411655, + "grad_norm": 0.8166249990463257, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 5530 + }, + { + "epoch": 4.138961524094135, + "grad_norm": 0.9728496074676514, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5540 + }, + { + "epoch": 4.1464325737766154, + "grad_norm": 0.7590922117233276, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 5550 + }, + { + "epoch": 4.153903623459096, + "grad_norm": 0.7759010791778564, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 5560 + }, + { + "epoch": 4.161374673141577, + "grad_norm": 0.9057986736297607, + "learning_rate": 0.0002, + "loss": 1.3635, + "step": 5570 + }, + { + "epoch": 4.168845722824057, + "grad_norm": 0.8853937983512878, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 5580 + }, + { + "epoch": 4.176316772506537, + "grad_norm": 0.7070684432983398, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5590 + }, + { + "epoch": 4.183787822189018, + "grad_norm": 0.7649410963058472, + "learning_rate": 0.0002, + "loss": 1.3218, + "step": 5600 + }, + { + "epoch": 4.191258871871498, + "grad_norm": 1.2048029899597168, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5610 + }, + { + "epoch": 4.198729921553978, + "grad_norm": 0.7986605763435364, + "learning_rate": 0.0002, + "loss": 1.3629, + "step": 5620 + }, + { + "epoch": 4.206200971236458, + "grad_norm": 0.8151885867118835, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 5630 + }, + { + "epoch": 4.213672020918939, + "grad_norm": 0.7719064354896545, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5640 + }, + { + "epoch": 4.2211430706014195, + "grad_norm": 0.8422448039054871, + "learning_rate": 0.0002, + "loss": 1.3852, + "step": 5650 + }, + { + "epoch": 4.2286141202839, + "grad_norm": 0.7017164826393127, + "learning_rate": 0.0002, + "loss": 1.3321, + "step": 5660 + }, + { + "epoch": 4.236085169966381, + "grad_norm": 0.8559677600860596, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 5670 + }, + { + "epoch": 4.243556219648861, + "grad_norm": 0.8216157555580139, + "learning_rate": 0.0002, + "loss": 1.3701, + "step": 5680 + }, + { + "epoch": 4.251027269331341, + "grad_norm": 0.7681755423545837, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5690 + }, + { + "epoch": 4.258498319013821, + "grad_norm": 0.811665952205658, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 5700 + }, + { + "epoch": 4.265969368696302, + "grad_norm": 0.7242204546928406, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 5710 + }, + { + "epoch": 4.273440418378782, + "grad_norm": 0.7570181488990784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 5720 + }, + { + "epoch": 4.280911468061262, + "grad_norm": 0.8951969146728516, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 5730 + }, + { + "epoch": 4.288382517743743, + "grad_norm": 0.7222902178764343, + "learning_rate": 0.0002, + "loss": 1.3895, + "step": 5740 + }, + { + "epoch": 4.2958535674262235, + "grad_norm": 0.8508469462394714, + "learning_rate": 0.0002, + "loss": 1.4155, + "step": 5750 + }, + { + "epoch": 4.303324617108704, + "grad_norm": 0.7215430736541748, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 5760 + }, + { + "epoch": 4.310795666791184, + "grad_norm": 0.8774884939193726, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5770 + }, + { + "epoch": 4.318266716473665, + "grad_norm": 0.8354552984237671, + "learning_rate": 0.0002, + "loss": 1.427, + "step": 5780 + }, + { + "epoch": 4.325737766156145, + "grad_norm": 0.6938814520835876, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 5790 + }, + { + "epoch": 4.333208815838625, + "grad_norm": 0.78675377368927, + "learning_rate": 0.0002, + "loss": 1.3589, + "step": 5800 + }, + { + "epoch": 4.340679865521106, + "grad_norm": 0.7147697806358337, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 5810 + }, + { + "epoch": 4.348150915203586, + "grad_norm": 0.7693623304367065, + "learning_rate": 0.0002, + "loss": 1.3597, + "step": 5820 + }, + { + "epoch": 4.355621964886066, + "grad_norm": 0.856517493724823, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 5830 + }, + { + "epoch": 4.3630930145685465, + "grad_norm": 0.7200973033905029, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 5840 + }, + { + "epoch": 4.3705640642510275, + "grad_norm": 0.743281364440918, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 5850 + }, + { + "epoch": 4.378035113933508, + "grad_norm": 0.7627727389335632, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5860 + }, + { + "epoch": 4.385506163615988, + "grad_norm": 0.7238836884498596, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 5870 + }, + { + "epoch": 4.392977213298469, + "grad_norm": 0.7253410816192627, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5880 + }, + { + "epoch": 4.400448262980949, + "grad_norm": 0.8232238292694092, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 5890 + }, + { + "epoch": 4.407919312663429, + "grad_norm": 0.8778504729270935, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5900 + }, + { + "epoch": 4.415390362345909, + "grad_norm": 0.7639474868774414, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5910 + }, + { + "epoch": 4.42286141202839, + "grad_norm": 0.7666519284248352, + "learning_rate": 0.0002, + "loss": 1.3862, + "step": 5920 + }, + { + "epoch": 4.43033246171087, + "grad_norm": 0.867132842540741, + "learning_rate": 0.0002, + "loss": 1.4168, + "step": 5930 + }, + { + "epoch": 4.4378035113933505, + "grad_norm": 0.7571166753768921, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 5940 + }, + { + "epoch": 4.4452745610758315, + "grad_norm": 0.7911370992660522, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 5950 + }, + { + "epoch": 4.452745610758312, + "grad_norm": 0.8844250440597534, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 5960 + }, + { + "epoch": 4.460216660440792, + "grad_norm": 0.7336231470108032, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 5970 + }, + { + "epoch": 4.467687710123272, + "grad_norm": 0.8162738084793091, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 5980 + }, + { + "epoch": 4.475158759805753, + "grad_norm": 0.7413017153739929, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 5990 + }, + { + "epoch": 4.482629809488233, + "grad_norm": 0.7215432524681091, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 6000 + }, + { + "epoch": 4.490100859170713, + "grad_norm": 0.8943389058113098, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 6010 + }, + { + "epoch": 4.497571908853194, + "grad_norm": 0.7850823998451233, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 6020 + }, + { + "epoch": 4.505042958535674, + "grad_norm": 0.8117504119873047, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 6030 + }, + { + "epoch": 4.5125140082181545, + "grad_norm": 0.8381605744361877, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 6040 + }, + { + "epoch": 4.519985057900635, + "grad_norm": 0.7964059710502625, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 6050 + }, + { + "epoch": 4.527456107583116, + "grad_norm": 0.7935128211975098, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 6060 + }, + { + "epoch": 4.534927157265596, + "grad_norm": 0.8725124597549438, + "learning_rate": 0.0002, + "loss": 1.3994, + "step": 6070 + }, + { + "epoch": 4.542398206948076, + "grad_norm": 0.880325198173523, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6080 + }, + { + "epoch": 4.549869256630557, + "grad_norm": 0.7220637202262878, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 6090 + }, + { + "epoch": 4.557340306313037, + "grad_norm": 0.6908547878265381, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6100 + }, + { + "epoch": 4.564811355995517, + "grad_norm": 0.797931969165802, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 6110 + }, + { + "epoch": 4.572282405677997, + "grad_norm": 0.7056134343147278, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 6120 + }, + { + "epoch": 4.579753455360478, + "grad_norm": 0.7850478887557983, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 6130 + }, + { + "epoch": 4.5872245050429585, + "grad_norm": 0.8112621307373047, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6140 + }, + { + "epoch": 4.594695554725439, + "grad_norm": 0.7040849328041077, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 6150 + }, + { + "epoch": 4.60216660440792, + "grad_norm": 0.7214553952217102, + "learning_rate": 0.0002, + "loss": 1.3526, + "step": 6160 + }, + { + "epoch": 4.6096376540904, + "grad_norm": 0.8616511821746826, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6170 + }, + { + "epoch": 4.61710870377288, + "grad_norm": 0.8374658226966858, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 6180 + }, + { + "epoch": 4.62457975345536, + "grad_norm": 0.6761606931686401, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 6190 + }, + { + "epoch": 4.632050803137841, + "grad_norm": 0.768028199672699, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 6200 + }, + { + "epoch": 4.639521852820321, + "grad_norm": 0.9372717142105103, + "learning_rate": 0.0002, + "loss": 1.3772, + "step": 6210 + }, + { + "epoch": 4.646992902502801, + "grad_norm": 0.7906546592712402, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 6220 + }, + { + "epoch": 4.654463952185282, + "grad_norm": 0.7376723289489746, + "learning_rate": 0.0002, + "loss": 1.3962, + "step": 6230 + }, + { + "epoch": 4.6619350018677626, + "grad_norm": 0.8972630500793457, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 6240 + }, + { + "epoch": 4.669406051550243, + "grad_norm": 0.8261756300926208, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 6250 + }, + { + "epoch": 4.676877101232723, + "grad_norm": 0.7512393593788147, + "learning_rate": 0.0002, + "loss": 1.3267, + "step": 6260 + }, + { + "epoch": 4.684348150915204, + "grad_norm": 0.7132362127304077, + "learning_rate": 0.0002, + "loss": 1.4278, + "step": 6270 + }, + { + "epoch": 4.691819200597684, + "grad_norm": 0.7690575122833252, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6280 + }, + { + "epoch": 4.699290250280164, + "grad_norm": 0.9886258840560913, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 6290 + }, + { + "epoch": 4.706761299962645, + "grad_norm": 0.9502435922622681, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6300 + }, + { + "epoch": 4.714232349645125, + "grad_norm": 0.702255129814148, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6310 + }, + { + "epoch": 4.721703399327605, + "grad_norm": 0.7713103890419006, + "learning_rate": 0.0002, + "loss": 1.4447, + "step": 6320 + }, + { + "epoch": 4.7291744490100855, + "grad_norm": 0.7778580784797668, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 6330 + }, + { + "epoch": 4.736645498692567, + "grad_norm": 0.7275111079216003, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 6340 + }, + { + "epoch": 4.744116548375047, + "grad_norm": 0.7728744149208069, + "learning_rate": 0.0002, + "loss": 1.4429, + "step": 6350 + }, + { + "epoch": 4.751587598057527, + "grad_norm": 0.9724260568618774, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 6360 + }, + { + "epoch": 4.759058647740007, + "grad_norm": 0.7505622506141663, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 6370 + }, + { + "epoch": 4.766529697422488, + "grad_norm": 0.7994682788848877, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 6380 + }, + { + "epoch": 4.774000747104968, + "grad_norm": 0.8432038426399231, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 6390 + }, + { + "epoch": 4.781471796787448, + "grad_norm": 0.7436022758483887, + "learning_rate": 0.0002, + "loss": 1.4606, + "step": 6400 + }, + { + "epoch": 4.788942846469929, + "grad_norm": 0.7709194421768188, + "learning_rate": 0.0002, + "loss": 1.3461, + "step": 6410 + }, + { + "epoch": 4.796413896152409, + "grad_norm": 0.8798436522483826, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6420 + }, + { + "epoch": 4.80388494583489, + "grad_norm": 0.790189266204834, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 6430 + }, + { + "epoch": 4.811355995517371, + "grad_norm": 0.6824303865432739, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 6440 + }, + { + "epoch": 4.818827045199851, + "grad_norm": 0.7501044869422913, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 6450 + }, + { + "epoch": 4.826298094882331, + "grad_norm": 0.8840398192405701, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 6460 + }, + { + "epoch": 4.833769144564811, + "grad_norm": 0.7812688946723938, + "learning_rate": 0.0002, + "loss": 1.4412, + "step": 6470 + }, + { + "epoch": 4.841240194247292, + "grad_norm": 0.7429926991462708, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6480 + }, + { + "epoch": 4.848711243929772, + "grad_norm": 0.7778021693229675, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 6490 + }, + { + "epoch": 4.856182293612252, + "grad_norm": 0.8270702362060547, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 6500 + }, + { + "epoch": 4.863653343294732, + "grad_norm": 0.6960513591766357, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6510 + }, + { + "epoch": 4.8711243929772134, + "grad_norm": 0.7728942632675171, + "learning_rate": 0.0002, + "loss": 1.376, + "step": 6520 + }, + { + "epoch": 4.878595442659694, + "grad_norm": 0.7377303838729858, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 6530 + }, + { + "epoch": 4.886066492342174, + "grad_norm": 0.7257253527641296, + "learning_rate": 0.0002, + "loss": 1.3846, + "step": 6540 + }, + { + "epoch": 4.893537542024655, + "grad_norm": 0.7875821590423584, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 6550 + }, + { + "epoch": 4.901008591707135, + "grad_norm": 0.8346304297447205, + "learning_rate": 0.0002, + "loss": 1.357, + "step": 6560 + }, + { + "epoch": 4.908479641389615, + "grad_norm": 0.7710739374160767, + "learning_rate": 0.0002, + "loss": 1.4522, + "step": 6570 + }, + { + "epoch": 4.915950691072096, + "grad_norm": 0.7015138268470764, + "learning_rate": 0.0002, + "loss": 1.4465, + "step": 6580 + }, + { + "epoch": 4.923421740754576, + "grad_norm": 0.8707432150840759, + "learning_rate": 0.0002, + "loss": 1.435, + "step": 6590 + }, + { + "epoch": 4.930892790437056, + "grad_norm": 0.786601185798645, + "learning_rate": 0.0002, + "loss": 1.2968, + "step": 6600 + }, + { + "epoch": 4.938363840119536, + "grad_norm": 0.978519082069397, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 6610 + }, + { + "epoch": 4.9458348898020175, + "grad_norm": 0.8102927207946777, + "learning_rate": 0.0002, + "loss": 1.3997, + "step": 6620 + }, + { + "epoch": 4.953305939484498, + "grad_norm": 0.7628704309463501, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 6630 + }, + { + "epoch": 4.960776989166978, + "grad_norm": 0.8053455352783203, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 6640 + }, + { + "epoch": 4.968248038849458, + "grad_norm": 0.8680412173271179, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 6650 + }, + { + "epoch": 4.975719088531939, + "grad_norm": 0.7415758371353149, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 6660 + }, + { + "epoch": 4.983190138214419, + "grad_norm": 0.7730312347412109, + "learning_rate": 0.0002, + "loss": 1.3793, + "step": 6670 + }, + { + "epoch": 4.990661187896899, + "grad_norm": 0.7924041152000427, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 6680 + }, + { + "epoch": 4.99813223757938, + "grad_norm": 0.8677893877029419, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 6690 + }, + { + "epoch": 4.999626447515876, + "eval_loss": 1.9444633722305298, + "eval_runtime": 39.3488, + "eval_samples_per_second": 13.088, + "eval_steps_per_second": 1.652, + "step": 6692 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.0971391488557056e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-6692/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f7272d48984d2d8648f89d62a64d97cf42adcc4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46904eb2dc787478b572840fe84051b93ba02c40147865a235946aefa6c1711a +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2febd43ac8c979c08c9655a9f74869e9a06d6e6a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c690d8d86a430610e78803a274c3ac920a0a69e8dfb445585a26730a9bd6a18a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c9f197d87560ca90f917f637cb1077d6b4503e5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042d0f7c0322075840dbdbfc8564dfcb7d871dae8201d1a5e5ea29425c005636 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b8422814b4a9fcc34918e1564540e393428e2a0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc0b46f21e02353093efbfa2699d5185e23bdb76aaa14d3a0676bc7227f85624 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..146718c3cbc7117cf6f0e7a0ba86f7ce72aa9dc6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/trainer_state.json @@ -0,0 +1,5702 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 8031, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + }, + { + "epoch": 2.002241314904744, + "grad_norm": 0.36817631125450134, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 2680 + }, + { + "epoch": 2.0097123645872244, + "grad_norm": 0.4056456685066223, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2690 + }, + { + "epoch": 2.017183414269705, + "grad_norm": 0.37416863441467285, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 2700 + }, + { + "epoch": 2.024654463952185, + "grad_norm": 0.4273638427257538, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2710 + }, + { + "epoch": 2.0321255136346656, + "grad_norm": 0.36497923731803894, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2720 + }, + { + "epoch": 2.0395965633171462, + "grad_norm": 0.5021994113922119, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 2730 + }, + { + "epoch": 2.0470676129996264, + "grad_norm": 0.45896220207214355, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 2740 + }, + { + "epoch": 2.054538662682107, + "grad_norm": 0.3973815143108368, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2750 + }, + { + "epoch": 2.062009712364587, + "grad_norm": 0.4521815776824951, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2760 + }, + { + "epoch": 2.0694807620470677, + "grad_norm": 0.42775002121925354, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2770 + }, + { + "epoch": 2.076951811729548, + "grad_norm": 0.48158586025238037, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 2780 + }, + { + "epoch": 2.0844228614120284, + "grad_norm": 0.4612371623516083, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2790 + }, + { + "epoch": 2.091893911094509, + "grad_norm": 0.42536866664886475, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 2800 + }, + { + "epoch": 2.099364960776989, + "grad_norm": 0.48515772819519043, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 2810 + }, + { + "epoch": 2.1068360104594697, + "grad_norm": 0.41418662667274475, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2820 + }, + { + "epoch": 2.11430706014195, + "grad_norm": 0.4683697819709778, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2830 + }, + { + "epoch": 2.1217781098244304, + "grad_norm": 0.4484657049179077, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2840 + }, + { + "epoch": 2.1292491595069105, + "grad_norm": 0.6621400713920593, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 2850 + }, + { + "epoch": 2.136720209189391, + "grad_norm": 0.45074811577796936, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2860 + }, + { + "epoch": 2.1441912588718717, + "grad_norm": 0.3513113558292389, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2870 + }, + { + "epoch": 2.151662308554352, + "grad_norm": 0.40411314368247986, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2880 + }, + { + "epoch": 2.1591333582368324, + "grad_norm": 0.4121065139770508, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 2890 + }, + { + "epoch": 2.1666044079193125, + "grad_norm": 0.44723689556121826, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 2900 + }, + { + "epoch": 2.174075457601793, + "grad_norm": 0.4226122498512268, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2910 + }, + { + "epoch": 2.1815465072842732, + "grad_norm": 0.46617650985717773, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2920 + }, + { + "epoch": 2.189017556966754, + "grad_norm": 0.4506422281265259, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 2930 + }, + { + "epoch": 2.1964886066492344, + "grad_norm": 0.4892672896385193, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2940 + }, + { + "epoch": 2.2039596563317145, + "grad_norm": 0.44095516204833984, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2950 + }, + { + "epoch": 2.211430706014195, + "grad_norm": 0.41522109508514404, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2960 + }, + { + "epoch": 2.2189017556966752, + "grad_norm": 0.4860858917236328, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2970 + }, + { + "epoch": 2.226372805379156, + "grad_norm": 0.42662516236305237, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2980 + }, + { + "epoch": 2.233843855061636, + "grad_norm": 0.4390648305416107, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2990 + }, + { + "epoch": 2.2413149047441165, + "grad_norm": 0.47515565156936646, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3000 + }, + { + "epoch": 2.248785954426597, + "grad_norm": 0.4104543924331665, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 3010 + }, + { + "epoch": 2.2562570041090773, + "grad_norm": 0.4404028654098511, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 3020 + }, + { + "epoch": 2.263728053791558, + "grad_norm": 0.4717366695404053, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3030 + }, + { + "epoch": 2.271199103474038, + "grad_norm": 0.48345857858657837, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 3040 + }, + { + "epoch": 2.2786701531565186, + "grad_norm": 0.5312452912330627, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 3050 + }, + { + "epoch": 2.2861412028389987, + "grad_norm": 0.5073099732398987, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3060 + }, + { + "epoch": 2.2936122525214793, + "grad_norm": 0.5027463436126709, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 3070 + }, + { + "epoch": 2.30108330220396, + "grad_norm": 0.5436304807662964, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3080 + }, + { + "epoch": 2.30855435188644, + "grad_norm": 0.4701065123081207, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3090 + }, + { + "epoch": 2.3160254015689206, + "grad_norm": 0.46988746523857117, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 3100 + }, + { + "epoch": 2.3234964512514007, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 3110 + }, + { + "epoch": 2.3309675009338813, + "grad_norm": 0.5173566937446594, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 3120 + }, + { + "epoch": 2.3384385506163614, + "grad_norm": 0.40345850586891174, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 3130 + }, + { + "epoch": 2.345909600298842, + "grad_norm": 0.4218924939632416, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3140 + }, + { + "epoch": 2.3533806499813226, + "grad_norm": 0.41857317090034485, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 3150 + }, + { + "epoch": 2.3608516996638027, + "grad_norm": 0.4197218418121338, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3160 + }, + { + "epoch": 2.3683227493462833, + "grad_norm": 0.4260677397251129, + "learning_rate": 0.0002, + "loss": 1.6572, + "step": 3170 + }, + { + "epoch": 2.3757937990287634, + "grad_norm": 0.4209042191505432, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3180 + }, + { + "epoch": 2.383264848711244, + "grad_norm": 0.4092234969139099, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3190 + }, + { + "epoch": 2.390735898393724, + "grad_norm": 0.4928431510925293, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 3200 + }, + { + "epoch": 2.3982069480762047, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3210 + }, + { + "epoch": 2.4056779977586853, + "grad_norm": 0.4368397295475006, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3220 + }, + { + "epoch": 2.4131490474411654, + "grad_norm": 0.46122390031814575, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 3230 + }, + { + "epoch": 2.420620097123646, + "grad_norm": 0.4272301197052002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.428091146806126, + "grad_norm": 0.41480937600135803, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3250 + }, + { + "epoch": 2.4355621964886067, + "grad_norm": 0.48911941051483154, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 3260 + }, + { + "epoch": 2.443033246171087, + "grad_norm": 0.4444098472595215, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 3270 + }, + { + "epoch": 2.4505042958535674, + "grad_norm": 0.5111684799194336, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 3280 + }, + { + "epoch": 2.457975345536048, + "grad_norm": 0.5058825016021729, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 3290 + }, + { + "epoch": 2.465446395218528, + "grad_norm": 0.44173210859298706, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3300 + }, + { + "epoch": 2.4729174449010087, + "grad_norm": 0.4659745991230011, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 3310 + }, + { + "epoch": 2.480388494583489, + "grad_norm": 0.47237497568130493, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3320 + }, + { + "epoch": 2.4878595442659694, + "grad_norm": 0.47303131222724915, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 3330 + }, + { + "epoch": 2.4953305939484496, + "grad_norm": 0.4522389769554138, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 3340 + }, + { + "epoch": 2.50280164363093, + "grad_norm": 0.4467332363128662, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3350 + }, + { + "epoch": 2.5102726933134107, + "grad_norm": 0.4413762092590332, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3360 + }, + { + "epoch": 2.517743742995891, + "grad_norm": 0.495514452457428, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 3370 + }, + { + "epoch": 2.5252147926783715, + "grad_norm": 0.4429773986339569, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3380 + }, + { + "epoch": 2.5326858423608516, + "grad_norm": 0.4589079022407532, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3390 + }, + { + "epoch": 2.540156892043332, + "grad_norm": 0.4683997333049774, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 3400 + }, + { + "epoch": 2.5476279417258123, + "grad_norm": 0.4651731252670288, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 3410 + }, + { + "epoch": 2.555098991408293, + "grad_norm": 0.45818084478378296, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3420 + }, + { + "epoch": 2.5625700410907735, + "grad_norm": 0.45209529995918274, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.5700410907732536, + "grad_norm": 0.4344733655452728, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3440 + }, + { + "epoch": 2.577512140455734, + "grad_norm": 0.47435566782951355, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 3450 + }, + { + "epoch": 2.5849831901382143, + "grad_norm": 0.43841999769210815, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3460 + }, + { + "epoch": 2.592454239820695, + "grad_norm": 0.4323869049549103, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 3470 + }, + { + "epoch": 2.599925289503175, + "grad_norm": 0.44355881214141846, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 3480 + }, + { + "epoch": 2.6073963391856556, + "grad_norm": 0.45847779512405396, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 3490 + }, + { + "epoch": 2.614867388868136, + "grad_norm": 0.4411061704158783, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3500 + }, + { + "epoch": 2.6223384385506163, + "grad_norm": 0.4446796178817749, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3510 + }, + { + "epoch": 2.629809488233097, + "grad_norm": 0.41969653964042664, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3520 + }, + { + "epoch": 2.637280537915577, + "grad_norm": 0.5263747572898865, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 3530 + }, + { + "epoch": 2.6447515875980576, + "grad_norm": 0.47719451785087585, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3540 + }, + { + "epoch": 2.6522226372805378, + "grad_norm": 0.46574118733406067, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 3550 + }, + { + "epoch": 2.6596936869630183, + "grad_norm": 0.46867135167121887, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 3560 + }, + { + "epoch": 2.667164736645499, + "grad_norm": 0.4441198706626892, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3570 + }, + { + "epoch": 2.674635786327979, + "grad_norm": 0.4871319830417633, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3580 + }, + { + "epoch": 2.6821068360104596, + "grad_norm": 0.43900373578071594, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 3590 + }, + { + "epoch": 2.6895778856929398, + "grad_norm": 0.42509549856185913, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 3600 + }, + { + "epoch": 2.6970489353754203, + "grad_norm": 0.4691086709499359, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3610 + }, + { + "epoch": 2.7045199850579005, + "grad_norm": 0.46318942308425903, + "learning_rate": 0.0002, + "loss": 1.5491, + "step": 3620 + }, + { + "epoch": 2.711991034740381, + "grad_norm": 0.44631096720695496, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3630 + }, + { + "epoch": 2.7194620844228616, + "grad_norm": 0.42315489053726196, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3640 + }, + { + "epoch": 2.7269331341053418, + "grad_norm": 0.4971241056919098, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3650 + }, + { + "epoch": 2.7344041837878224, + "grad_norm": 0.4578486382961273, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 3660 + }, + { + "epoch": 2.7418752334703025, + "grad_norm": 0.46584776043891907, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3670 + }, + { + "epoch": 2.749346283152783, + "grad_norm": 0.4951731264591217, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 3680 + }, + { + "epoch": 2.756817332835263, + "grad_norm": 0.4935225546360016, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3690 + }, + { + "epoch": 2.764288382517744, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3700 + }, + { + "epoch": 2.7717594322002244, + "grad_norm": 0.4417555630207062, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 3710 + }, + { + "epoch": 2.7792304818827045, + "grad_norm": 0.48229655623435974, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 3720 + }, + { + "epoch": 2.786701531565185, + "grad_norm": 0.48562315106391907, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3730 + }, + { + "epoch": 2.794172581247665, + "grad_norm": 0.4473940432071686, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3740 + }, + { + "epoch": 2.801643630930146, + "grad_norm": 0.4626813232898712, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3750 + }, + { + "epoch": 2.809114680612626, + "grad_norm": 0.4339792728424072, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3760 + }, + { + "epoch": 2.8165857302951065, + "grad_norm": 0.5250858068466187, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 3770 + }, + { + "epoch": 2.824056779977587, + "grad_norm": 0.4537523090839386, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3780 + }, + { + "epoch": 2.831527829660067, + "grad_norm": 0.5646113157272339, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3790 + }, + { + "epoch": 2.8389988793425474, + "grad_norm": 0.44243332743644714, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3800 + }, + { + "epoch": 2.846469929025028, + "grad_norm": 0.4585791826248169, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3810 + }, + { + "epoch": 2.8539409787075085, + "grad_norm": 0.489702045917511, + "learning_rate": 0.0002, + "loss": 1.6854, + "step": 3820 + }, + { + "epoch": 2.8614120283899886, + "grad_norm": 0.502470850944519, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 3830 + }, + { + "epoch": 2.8688830780724692, + "grad_norm": 0.4395960867404938, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3840 + }, + { + "epoch": 2.87635412775495, + "grad_norm": 0.4348670244216919, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3850 + }, + { + "epoch": 2.88382517743743, + "grad_norm": 0.48852720856666565, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3860 + }, + { + "epoch": 2.89129622711991, + "grad_norm": 0.45317450165748596, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3870 + }, + { + "epoch": 2.8987672768023907, + "grad_norm": 0.4732758700847626, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3880 + }, + { + "epoch": 2.9062383264848712, + "grad_norm": 0.45238012075424194, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3890 + }, + { + "epoch": 2.9137093761673514, + "grad_norm": 0.48838064074516296, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3900 + }, + { + "epoch": 2.921180425849832, + "grad_norm": 0.43496349453926086, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 3910 + }, + { + "epoch": 2.9286514755323125, + "grad_norm": 0.47963935136795044, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 3920 + }, + { + "epoch": 2.9361225252147927, + "grad_norm": 0.4544987976551056, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 3930 + }, + { + "epoch": 2.943593574897273, + "grad_norm": 0.4622892141342163, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3940 + }, + { + "epoch": 2.9510646245797534, + "grad_norm": 0.47026222944259644, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 3950 + }, + { + "epoch": 2.958535674262234, + "grad_norm": 0.4549552798271179, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 3960 + }, + { + "epoch": 2.966006723944714, + "grad_norm": 0.46647515892982483, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3970 + }, + { + "epoch": 2.9734777736271947, + "grad_norm": 0.45095112919807434, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3980 + }, + { + "epoch": 2.9809488233096753, + "grad_norm": 0.4690017104148865, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 3990 + }, + { + "epoch": 2.9884198729921554, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 4000 + }, + { + "epoch": 2.9958909226746355, + "grad_norm": 0.4743294417858124, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4010 + }, + { + "epoch": 2.999626447515876, + "eval_loss": 1.8252571821212769, + "eval_runtime": 38.7853, + "eval_samples_per_second": 13.278, + "eval_steps_per_second": 1.676, + "step": 4015 + }, + { + "epoch": 3.003361972357116, + "grad_norm": 0.4919724464416504, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 4020 + }, + { + "epoch": 3.0108330220395967, + "grad_norm": 0.4747185707092285, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4030 + }, + { + "epoch": 3.018304071722077, + "grad_norm": 0.4797595143318176, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 4040 + }, + { + "epoch": 3.0257751214045574, + "grad_norm": 0.5450999140739441, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 4050 + }, + { + "epoch": 3.0332461710870375, + "grad_norm": 0.49058812856674194, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4060 + }, + { + "epoch": 3.040717220769518, + "grad_norm": 0.5219563841819763, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4070 + }, + { + "epoch": 3.0481882704519987, + "grad_norm": 0.515628457069397, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 4080 + }, + { + "epoch": 3.055659320134479, + "grad_norm": 0.6145984530448914, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 4090 + }, + { + "epoch": 3.0631303698169594, + "grad_norm": 0.6067144274711609, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 4100 + }, + { + "epoch": 3.0706014194994395, + "grad_norm": 0.5773133039474487, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4110 + }, + { + "epoch": 3.07807246918192, + "grad_norm": 0.6894241571426392, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 4120 + }, + { + "epoch": 3.0855435188644003, + "grad_norm": 0.6422514915466309, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4130 + }, + { + "epoch": 3.093014568546881, + "grad_norm": 0.6119855046272278, + "learning_rate": 0.0002, + "loss": 1.4724, + "step": 4140 + }, + { + "epoch": 3.1004856182293614, + "grad_norm": 0.5847280025482178, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 4150 + }, + { + "epoch": 3.1079566679118416, + "grad_norm": 0.5401515960693359, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4160 + }, + { + "epoch": 3.115427717594322, + "grad_norm": 0.6501587629318237, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 4170 + }, + { + "epoch": 3.1228987672768023, + "grad_norm": 0.5988039374351501, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 4180 + }, + { + "epoch": 3.130369816959283, + "grad_norm": 0.4982665181159973, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 4190 + }, + { + "epoch": 3.137840866641763, + "grad_norm": 0.5548039078712463, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 4200 + }, + { + "epoch": 3.1453119163242436, + "grad_norm": 0.5920777320861816, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 4210 + }, + { + "epoch": 3.152782966006724, + "grad_norm": 0.6965190172195435, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 4220 + }, + { + "epoch": 3.1602540156892043, + "grad_norm": 0.5196244716644287, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4230 + }, + { + "epoch": 3.167725065371685, + "grad_norm": 0.6942682266235352, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4240 + }, + { + "epoch": 3.175196115054165, + "grad_norm": 0.5765156149864197, + "learning_rate": 0.0002, + "loss": 1.5407, + "step": 4250 + }, + { + "epoch": 3.1826671647366456, + "grad_norm": 0.5801976919174194, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 4260 + }, + { + "epoch": 3.1901382144191257, + "grad_norm": 0.6260752081871033, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4270 + }, + { + "epoch": 3.1976092641016063, + "grad_norm": 0.6610770225524902, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 4280 + }, + { + "epoch": 3.205080313784087, + "grad_norm": 0.5762143135070801, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 4290 + }, + { + "epoch": 3.212551363466567, + "grad_norm": 0.5926990509033203, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 4300 + }, + { + "epoch": 3.2200224131490476, + "grad_norm": 0.7373854517936707, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 4310 + }, + { + "epoch": 3.2274934628315277, + "grad_norm": 0.5963311195373535, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 4320 + }, + { + "epoch": 3.2349645125140083, + "grad_norm": 0.5754616856575012, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 4330 + }, + { + "epoch": 3.2424355621964884, + "grad_norm": 0.6116095781326294, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 4340 + }, + { + "epoch": 3.249906611878969, + "grad_norm": 0.6001536846160889, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 4350 + }, + { + "epoch": 3.257377661561449, + "grad_norm": 0.5270227789878845, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 4360 + }, + { + "epoch": 3.2648487112439297, + "grad_norm": 0.6666602492332458, + "learning_rate": 0.0002, + "loss": 1.5235, + "step": 4370 + }, + { + "epoch": 3.2723197609264103, + "grad_norm": 0.520310640335083, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4380 + }, + { + "epoch": 3.2797908106088904, + "grad_norm": 0.5165975093841553, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 4390 + }, + { + "epoch": 3.287261860291371, + "grad_norm": 0.6080228686332703, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4400 + }, + { + "epoch": 3.294732909973851, + "grad_norm": 0.670122504234314, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 4410 + }, + { + "epoch": 3.3022039596563317, + "grad_norm": 0.6019457578659058, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 4420 + }, + { + "epoch": 3.309675009338812, + "grad_norm": 0.5519300103187561, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 4430 + }, + { + "epoch": 3.3171460590212924, + "grad_norm": 0.5958521962165833, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 4440 + }, + { + "epoch": 3.324617108703773, + "grad_norm": 0.5552705526351929, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4450 + }, + { + "epoch": 3.332088158386253, + "grad_norm": 0.6583784818649292, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4460 + }, + { + "epoch": 3.3395592080687337, + "grad_norm": 0.5815939903259277, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4470 + }, + { + "epoch": 3.347030257751214, + "grad_norm": 1.3342205286026, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4480 + }, + { + "epoch": 3.3545013074336945, + "grad_norm": 0.6341500878334045, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 4490 + }, + { + "epoch": 3.3619723571161746, + "grad_norm": 0.6384079456329346, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 4500 + }, + { + "epoch": 3.369443406798655, + "grad_norm": 0.6098346710205078, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 4510 + }, + { + "epoch": 3.3769144564811358, + "grad_norm": 0.5958296656608582, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4520 + }, + { + "epoch": 3.384385506163616, + "grad_norm": 0.6157881617546082, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 4530 + }, + { + "epoch": 3.3918565558460965, + "grad_norm": 0.5671007037162781, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 4540 + }, + { + "epoch": 3.3993276055285766, + "grad_norm": 0.6203294992446899, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 4550 + }, + { + "epoch": 3.406798655211057, + "grad_norm": 0.6743317246437073, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 4560 + }, + { + "epoch": 3.4142697048935373, + "grad_norm": 0.731765627861023, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4570 + }, + { + "epoch": 3.421740754576018, + "grad_norm": 0.6285187602043152, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 4580 + }, + { + "epoch": 3.4292118042584985, + "grad_norm": 0.612680196762085, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 4590 + }, + { + "epoch": 3.4366828539409786, + "grad_norm": 0.6413681507110596, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 4600 + }, + { + "epoch": 3.444153903623459, + "grad_norm": 0.6240990161895752, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4610 + }, + { + "epoch": 3.4516249533059393, + "grad_norm": 0.5095735192298889, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4620 + }, + { + "epoch": 3.45909600298842, + "grad_norm": 0.5699611902236938, + "learning_rate": 0.0002, + "loss": 1.4906, + "step": 4630 + }, + { + "epoch": 3.4665670526709, + "grad_norm": 0.7289775609970093, + "learning_rate": 0.0002, + "loss": 1.5176, + "step": 4640 + }, + { + "epoch": 3.4740381023533806, + "grad_norm": 0.6211609840393066, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 4650 + }, + { + "epoch": 3.481509152035861, + "grad_norm": 0.5714802145957947, + "learning_rate": 0.0002, + "loss": 1.533, + "step": 4660 + }, + { + "epoch": 3.4889802017183413, + "grad_norm": 0.6287049651145935, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 4670 + }, + { + "epoch": 3.496451251400822, + "grad_norm": 0.5480595827102661, + "learning_rate": 0.0002, + "loss": 1.4212, + "step": 4680 + }, + { + "epoch": 3.503922301083302, + "grad_norm": 0.5683253407478333, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4690 + }, + { + "epoch": 3.5113933507657826, + "grad_norm": 0.601140558719635, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4700 + }, + { + "epoch": 3.5188644004482628, + "grad_norm": 0.5344498157501221, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 4710 + }, + { + "epoch": 3.5263354501307433, + "grad_norm": 0.5739690661430359, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4720 + }, + { + "epoch": 3.533806499813224, + "grad_norm": 0.5640085935592651, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 4730 + }, + { + "epoch": 3.541277549495704, + "grad_norm": 0.5967805981636047, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 4740 + }, + { + "epoch": 3.5487485991781846, + "grad_norm": 0.6138835549354553, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4750 + }, + { + "epoch": 3.5562196488606648, + "grad_norm": 0.6779900193214417, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 4760 + }, + { + "epoch": 3.5636906985431454, + "grad_norm": 0.6122010350227356, + "learning_rate": 0.0002, + "loss": 1.4917, + "step": 4770 + }, + { + "epoch": 3.5711617482256255, + "grad_norm": 0.5685241222381592, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4780 + }, + { + "epoch": 3.578632797908106, + "grad_norm": 0.604583203792572, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 4790 + }, + { + "epoch": 3.5861038475905866, + "grad_norm": 0.651165246963501, + "learning_rate": 0.0002, + "loss": 1.4514, + "step": 4800 + }, + { + "epoch": 3.593574897273067, + "grad_norm": 0.6398511528968811, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 4810 + }, + { + "epoch": 3.6010459469555474, + "grad_norm": 0.6444641351699829, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4820 + }, + { + "epoch": 3.6085169966380275, + "grad_norm": 0.6018481850624084, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 4830 + }, + { + "epoch": 3.615988046320508, + "grad_norm": 0.6025291085243225, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 4840 + }, + { + "epoch": 3.623459096002988, + "grad_norm": 0.6810156106948853, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4850 + }, + { + "epoch": 3.630930145685469, + "grad_norm": 0.6408044695854187, + "learning_rate": 0.0002, + "loss": 1.5299, + "step": 4860 + }, + { + "epoch": 3.6384011953679494, + "grad_norm": 0.5608272552490234, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4870 + }, + { + "epoch": 3.6458722450504295, + "grad_norm": 0.6136814951896667, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 4880 + }, + { + "epoch": 3.65334329473291, + "grad_norm": 0.5927900075912476, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4890 + }, + { + "epoch": 3.66081434441539, + "grad_norm": 0.5336901545524597, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 4900 + }, + { + "epoch": 3.668285394097871, + "grad_norm": 0.7823320627212524, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 4910 + }, + { + "epoch": 3.675756443780351, + "grad_norm": 0.6703504323959351, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 4920 + }, + { + "epoch": 3.6832274934628315, + "grad_norm": 0.6061160564422607, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 4930 + }, + { + "epoch": 3.690698543145312, + "grad_norm": 0.6237227916717529, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4940 + }, + { + "epoch": 3.6981695928277922, + "grad_norm": 0.5985278487205505, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 4950 + }, + { + "epoch": 3.705640642510273, + "grad_norm": 0.6483839750289917, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 4960 + }, + { + "epoch": 3.713111692192753, + "grad_norm": 0.5788805484771729, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 4970 + }, + { + "epoch": 3.7205827418752335, + "grad_norm": 0.5609974265098572, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 4980 + }, + { + "epoch": 3.7280537915577137, + "grad_norm": 0.5681300759315491, + "learning_rate": 0.0002, + "loss": 1.4759, + "step": 4990 + }, + { + "epoch": 3.7355248412401942, + "grad_norm": 0.5860186219215393, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 5000 + }, + { + "epoch": 3.742995890922675, + "grad_norm": 0.5718157291412354, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 5010 + }, + { + "epoch": 3.750466940605155, + "grad_norm": 0.6173721551895142, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 5020 + }, + { + "epoch": 3.7579379902876355, + "grad_norm": 0.629152238368988, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 5030 + }, + { + "epoch": 3.7654090399701157, + "grad_norm": 0.5666284561157227, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 5040 + }, + { + "epoch": 3.7728800896525962, + "grad_norm": 0.6053005456924438, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5050 + }, + { + "epoch": 3.7803511393350764, + "grad_norm": 0.5870583057403564, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 5060 + }, + { + "epoch": 3.787822189017557, + "grad_norm": 0.5422009229660034, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5070 + }, + { + "epoch": 3.7952932387000375, + "grad_norm": 0.5396918058395386, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 5080 + }, + { + "epoch": 3.8027642883825177, + "grad_norm": 0.5544713139533997, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 5090 + }, + { + "epoch": 3.8102353380649983, + "grad_norm": 0.5983749628067017, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5100 + }, + { + "epoch": 3.8177063877474784, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 5110 + }, + { + "epoch": 3.825177437429959, + "grad_norm": 0.5436882376670837, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 5120 + }, + { + "epoch": 3.832648487112439, + "grad_norm": 0.5453617572784424, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 5130 + }, + { + "epoch": 3.8401195367949197, + "grad_norm": 0.6269069314002991, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 5140 + }, + { + "epoch": 3.8475905864774003, + "grad_norm": 0.6189185380935669, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 5150 + }, + { + "epoch": 3.8550616361598804, + "grad_norm": 0.6653388142585754, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 5160 + }, + { + "epoch": 3.862532685842361, + "grad_norm": 0.5771768689155579, + "learning_rate": 0.0002, + "loss": 1.5075, + "step": 5170 + }, + { + "epoch": 3.870003735524841, + "grad_norm": 0.6052790880203247, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5180 + }, + { + "epoch": 3.8774747852073217, + "grad_norm": 0.6572316884994507, + "learning_rate": 0.0002, + "loss": 1.4987, + "step": 5190 + }, + { + "epoch": 3.884945834889802, + "grad_norm": 0.670576810836792, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 5200 + }, + { + "epoch": 3.8924168845722824, + "grad_norm": 0.5728798508644104, + "learning_rate": 0.0002, + "loss": 1.4777, + "step": 5210 + }, + { + "epoch": 3.899887934254763, + "grad_norm": 0.6340774297714233, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 5220 + }, + { + "epoch": 3.907358983937243, + "grad_norm": 0.5981315970420837, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 5230 + }, + { + "epoch": 3.9148300336197237, + "grad_norm": 0.6212025880813599, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 5240 + }, + { + "epoch": 3.922301083302204, + "grad_norm": 0.6202296018600464, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5250 + }, + { + "epoch": 3.9297721329846844, + "grad_norm": 0.6159142255783081, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 5260 + }, + { + "epoch": 3.9372431826671646, + "grad_norm": 0.6519438624382019, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 5270 + }, + { + "epoch": 3.944714232349645, + "grad_norm": 0.539813756942749, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5280 + }, + { + "epoch": 3.9521852820321257, + "grad_norm": 0.6443665027618408, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 5290 + }, + { + "epoch": 3.959656331714606, + "grad_norm": 0.6635757684707642, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 5300 + }, + { + "epoch": 3.9671273813970864, + "grad_norm": 0.589363157749176, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 5310 + }, + { + "epoch": 3.9745984310795666, + "grad_norm": 0.5788735747337341, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 5320 + }, + { + "epoch": 3.982069480762047, + "grad_norm": 0.5976864695549011, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 5330 + }, + { + "epoch": 3.9895405304445273, + "grad_norm": 0.6624067425727844, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 5340 + }, + { + "epoch": 3.997011580127008, + "grad_norm": 0.6738956570625305, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 5350 + }, + { + "epoch": 4.0, + "eval_loss": 1.868006944656372, + "eval_runtime": 38.5153, + "eval_samples_per_second": 13.371, + "eval_steps_per_second": 1.688, + "step": 5354 + }, + { + "epoch": 4.004482629809488, + "grad_norm": 0.6023468971252441, + "learning_rate": 0.0002, + "loss": 1.4535, + "step": 5360 + }, + { + "epoch": 4.011953679491969, + "grad_norm": 0.8589285612106323, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 5370 + }, + { + "epoch": 4.019424729174449, + "grad_norm": 0.7477491497993469, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5380 + }, + { + "epoch": 4.02689577885693, + "grad_norm": 0.7601922154426575, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 5390 + }, + { + "epoch": 4.03436682853941, + "grad_norm": 0.8115614056587219, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 5400 + }, + { + "epoch": 4.04183787822189, + "grad_norm": 0.669925332069397, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5410 + }, + { + "epoch": 4.04930892790437, + "grad_norm": 0.8091904520988464, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 5420 + }, + { + "epoch": 4.056779977586851, + "grad_norm": 0.709405779838562, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 5430 + }, + { + "epoch": 4.064251027269331, + "grad_norm": 1.0006179809570312, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 5440 + }, + { + "epoch": 4.071722076951811, + "grad_norm": 0.7017965912818909, + "learning_rate": 0.0002, + "loss": 1.3491, + "step": 5450 + }, + { + "epoch": 4.0791931266342925, + "grad_norm": 0.8991572260856628, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 5460 + }, + { + "epoch": 4.086664176316773, + "grad_norm": 0.9064797759056091, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5470 + }, + { + "epoch": 4.094135225999253, + "grad_norm": 0.7981749176979065, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5480 + }, + { + "epoch": 4.101606275681733, + "grad_norm": 0.7280883193016052, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 5490 + }, + { + "epoch": 4.109077325364214, + "grad_norm": 0.7419600486755371, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 5500 + }, + { + "epoch": 4.116548375046694, + "grad_norm": 0.8019949197769165, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 5510 + }, + { + "epoch": 4.124019424729174, + "grad_norm": 0.7501229047775269, + "learning_rate": 0.0002, + "loss": 1.3133, + "step": 5520 + }, + { + "epoch": 4.131490474411655, + "grad_norm": 0.8166249990463257, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 5530 + }, + { + "epoch": 4.138961524094135, + "grad_norm": 0.9728496074676514, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5540 + }, + { + "epoch": 4.1464325737766154, + "grad_norm": 0.7590922117233276, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 5550 + }, + { + "epoch": 4.153903623459096, + "grad_norm": 0.7759010791778564, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 5560 + }, + { + "epoch": 4.161374673141577, + "grad_norm": 0.9057986736297607, + "learning_rate": 0.0002, + "loss": 1.3635, + "step": 5570 + }, + { + "epoch": 4.168845722824057, + "grad_norm": 0.8853937983512878, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 5580 + }, + { + "epoch": 4.176316772506537, + "grad_norm": 0.7070684432983398, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5590 + }, + { + "epoch": 4.183787822189018, + "grad_norm": 0.7649410963058472, + "learning_rate": 0.0002, + "loss": 1.3218, + "step": 5600 + }, + { + "epoch": 4.191258871871498, + "grad_norm": 1.2048029899597168, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5610 + }, + { + "epoch": 4.198729921553978, + "grad_norm": 0.7986605763435364, + "learning_rate": 0.0002, + "loss": 1.3629, + "step": 5620 + }, + { + "epoch": 4.206200971236458, + "grad_norm": 0.8151885867118835, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 5630 + }, + { + "epoch": 4.213672020918939, + "grad_norm": 0.7719064354896545, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5640 + }, + { + "epoch": 4.2211430706014195, + "grad_norm": 0.8422448039054871, + "learning_rate": 0.0002, + "loss": 1.3852, + "step": 5650 + }, + { + "epoch": 4.2286141202839, + "grad_norm": 0.7017164826393127, + "learning_rate": 0.0002, + "loss": 1.3321, + "step": 5660 + }, + { + "epoch": 4.236085169966381, + "grad_norm": 0.8559677600860596, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 5670 + }, + { + "epoch": 4.243556219648861, + "grad_norm": 0.8216157555580139, + "learning_rate": 0.0002, + "loss": 1.3701, + "step": 5680 + }, + { + "epoch": 4.251027269331341, + "grad_norm": 0.7681755423545837, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5690 + }, + { + "epoch": 4.258498319013821, + "grad_norm": 0.811665952205658, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 5700 + }, + { + "epoch": 4.265969368696302, + "grad_norm": 0.7242204546928406, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 5710 + }, + { + "epoch": 4.273440418378782, + "grad_norm": 0.7570181488990784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 5720 + }, + { + "epoch": 4.280911468061262, + "grad_norm": 0.8951969146728516, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 5730 + }, + { + "epoch": 4.288382517743743, + "grad_norm": 0.7222902178764343, + "learning_rate": 0.0002, + "loss": 1.3895, + "step": 5740 + }, + { + "epoch": 4.2958535674262235, + "grad_norm": 0.8508469462394714, + "learning_rate": 0.0002, + "loss": 1.4155, + "step": 5750 + }, + { + "epoch": 4.303324617108704, + "grad_norm": 0.7215430736541748, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 5760 + }, + { + "epoch": 4.310795666791184, + "grad_norm": 0.8774884939193726, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5770 + }, + { + "epoch": 4.318266716473665, + "grad_norm": 0.8354552984237671, + "learning_rate": 0.0002, + "loss": 1.427, + "step": 5780 + }, + { + "epoch": 4.325737766156145, + "grad_norm": 0.6938814520835876, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 5790 + }, + { + "epoch": 4.333208815838625, + "grad_norm": 0.78675377368927, + "learning_rate": 0.0002, + "loss": 1.3589, + "step": 5800 + }, + { + "epoch": 4.340679865521106, + "grad_norm": 0.7147697806358337, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 5810 + }, + { + "epoch": 4.348150915203586, + "grad_norm": 0.7693623304367065, + "learning_rate": 0.0002, + "loss": 1.3597, + "step": 5820 + }, + { + "epoch": 4.355621964886066, + "grad_norm": 0.856517493724823, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 5830 + }, + { + "epoch": 4.3630930145685465, + "grad_norm": 0.7200973033905029, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 5840 + }, + { + "epoch": 4.3705640642510275, + "grad_norm": 0.743281364440918, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 5850 + }, + { + "epoch": 4.378035113933508, + "grad_norm": 0.7627727389335632, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5860 + }, + { + "epoch": 4.385506163615988, + "grad_norm": 0.7238836884498596, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 5870 + }, + { + "epoch": 4.392977213298469, + "grad_norm": 0.7253410816192627, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5880 + }, + { + "epoch": 4.400448262980949, + "grad_norm": 0.8232238292694092, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 5890 + }, + { + "epoch": 4.407919312663429, + "grad_norm": 0.8778504729270935, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5900 + }, + { + "epoch": 4.415390362345909, + "grad_norm": 0.7639474868774414, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5910 + }, + { + "epoch": 4.42286141202839, + "grad_norm": 0.7666519284248352, + "learning_rate": 0.0002, + "loss": 1.3862, + "step": 5920 + }, + { + "epoch": 4.43033246171087, + "grad_norm": 0.867132842540741, + "learning_rate": 0.0002, + "loss": 1.4168, + "step": 5930 + }, + { + "epoch": 4.4378035113933505, + "grad_norm": 0.7571166753768921, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 5940 + }, + { + "epoch": 4.4452745610758315, + "grad_norm": 0.7911370992660522, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 5950 + }, + { + "epoch": 4.452745610758312, + "grad_norm": 0.8844250440597534, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 5960 + }, + { + "epoch": 4.460216660440792, + "grad_norm": 0.7336231470108032, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 5970 + }, + { + "epoch": 4.467687710123272, + "grad_norm": 0.8162738084793091, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 5980 + }, + { + "epoch": 4.475158759805753, + "grad_norm": 0.7413017153739929, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 5990 + }, + { + "epoch": 4.482629809488233, + "grad_norm": 0.7215432524681091, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 6000 + }, + { + "epoch": 4.490100859170713, + "grad_norm": 0.8943389058113098, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 6010 + }, + { + "epoch": 4.497571908853194, + "grad_norm": 0.7850823998451233, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 6020 + }, + { + "epoch": 4.505042958535674, + "grad_norm": 0.8117504119873047, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 6030 + }, + { + "epoch": 4.5125140082181545, + "grad_norm": 0.8381605744361877, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 6040 + }, + { + "epoch": 4.519985057900635, + "grad_norm": 0.7964059710502625, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 6050 + }, + { + "epoch": 4.527456107583116, + "grad_norm": 0.7935128211975098, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 6060 + }, + { + "epoch": 4.534927157265596, + "grad_norm": 0.8725124597549438, + "learning_rate": 0.0002, + "loss": 1.3994, + "step": 6070 + }, + { + "epoch": 4.542398206948076, + "grad_norm": 0.880325198173523, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6080 + }, + { + "epoch": 4.549869256630557, + "grad_norm": 0.7220637202262878, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 6090 + }, + { + "epoch": 4.557340306313037, + "grad_norm": 0.6908547878265381, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6100 + }, + { + "epoch": 4.564811355995517, + "grad_norm": 0.797931969165802, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 6110 + }, + { + "epoch": 4.572282405677997, + "grad_norm": 0.7056134343147278, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 6120 + }, + { + "epoch": 4.579753455360478, + "grad_norm": 0.7850478887557983, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 6130 + }, + { + "epoch": 4.5872245050429585, + "grad_norm": 0.8112621307373047, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6140 + }, + { + "epoch": 4.594695554725439, + "grad_norm": 0.7040849328041077, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 6150 + }, + { + "epoch": 4.60216660440792, + "grad_norm": 0.7214553952217102, + "learning_rate": 0.0002, + "loss": 1.3526, + "step": 6160 + }, + { + "epoch": 4.6096376540904, + "grad_norm": 0.8616511821746826, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6170 + }, + { + "epoch": 4.61710870377288, + "grad_norm": 0.8374658226966858, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 6180 + }, + { + "epoch": 4.62457975345536, + "grad_norm": 0.6761606931686401, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 6190 + }, + { + "epoch": 4.632050803137841, + "grad_norm": 0.768028199672699, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 6200 + }, + { + "epoch": 4.639521852820321, + "grad_norm": 0.9372717142105103, + "learning_rate": 0.0002, + "loss": 1.3772, + "step": 6210 + }, + { + "epoch": 4.646992902502801, + "grad_norm": 0.7906546592712402, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 6220 + }, + { + "epoch": 4.654463952185282, + "grad_norm": 0.7376723289489746, + "learning_rate": 0.0002, + "loss": 1.3962, + "step": 6230 + }, + { + "epoch": 4.6619350018677626, + "grad_norm": 0.8972630500793457, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 6240 + }, + { + "epoch": 4.669406051550243, + "grad_norm": 0.8261756300926208, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 6250 + }, + { + "epoch": 4.676877101232723, + "grad_norm": 0.7512393593788147, + "learning_rate": 0.0002, + "loss": 1.3267, + "step": 6260 + }, + { + "epoch": 4.684348150915204, + "grad_norm": 0.7132362127304077, + "learning_rate": 0.0002, + "loss": 1.4278, + "step": 6270 + }, + { + "epoch": 4.691819200597684, + "grad_norm": 0.7690575122833252, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6280 + }, + { + "epoch": 4.699290250280164, + "grad_norm": 0.9886258840560913, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 6290 + }, + { + "epoch": 4.706761299962645, + "grad_norm": 0.9502435922622681, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6300 + }, + { + "epoch": 4.714232349645125, + "grad_norm": 0.702255129814148, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6310 + }, + { + "epoch": 4.721703399327605, + "grad_norm": 0.7713103890419006, + "learning_rate": 0.0002, + "loss": 1.4447, + "step": 6320 + }, + { + "epoch": 4.7291744490100855, + "grad_norm": 0.7778580784797668, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 6330 + }, + { + "epoch": 4.736645498692567, + "grad_norm": 0.7275111079216003, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 6340 + }, + { + "epoch": 4.744116548375047, + "grad_norm": 0.7728744149208069, + "learning_rate": 0.0002, + "loss": 1.4429, + "step": 6350 + }, + { + "epoch": 4.751587598057527, + "grad_norm": 0.9724260568618774, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 6360 + }, + { + "epoch": 4.759058647740007, + "grad_norm": 0.7505622506141663, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 6370 + }, + { + "epoch": 4.766529697422488, + "grad_norm": 0.7994682788848877, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 6380 + }, + { + "epoch": 4.774000747104968, + "grad_norm": 0.8432038426399231, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 6390 + }, + { + "epoch": 4.781471796787448, + "grad_norm": 0.7436022758483887, + "learning_rate": 0.0002, + "loss": 1.4606, + "step": 6400 + }, + { + "epoch": 4.788942846469929, + "grad_norm": 0.7709194421768188, + "learning_rate": 0.0002, + "loss": 1.3461, + "step": 6410 + }, + { + "epoch": 4.796413896152409, + "grad_norm": 0.8798436522483826, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6420 + }, + { + "epoch": 4.80388494583489, + "grad_norm": 0.790189266204834, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 6430 + }, + { + "epoch": 4.811355995517371, + "grad_norm": 0.6824303865432739, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 6440 + }, + { + "epoch": 4.818827045199851, + "grad_norm": 0.7501044869422913, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 6450 + }, + { + "epoch": 4.826298094882331, + "grad_norm": 0.8840398192405701, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 6460 + }, + { + "epoch": 4.833769144564811, + "grad_norm": 0.7812688946723938, + "learning_rate": 0.0002, + "loss": 1.4412, + "step": 6470 + }, + { + "epoch": 4.841240194247292, + "grad_norm": 0.7429926991462708, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6480 + }, + { + "epoch": 4.848711243929772, + "grad_norm": 0.7778021693229675, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 6490 + }, + { + "epoch": 4.856182293612252, + "grad_norm": 0.8270702362060547, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 6500 + }, + { + "epoch": 4.863653343294732, + "grad_norm": 0.6960513591766357, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6510 + }, + { + "epoch": 4.8711243929772134, + "grad_norm": 0.7728942632675171, + "learning_rate": 0.0002, + "loss": 1.376, + "step": 6520 + }, + { + "epoch": 4.878595442659694, + "grad_norm": 0.7377303838729858, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 6530 + }, + { + "epoch": 4.886066492342174, + "grad_norm": 0.7257253527641296, + "learning_rate": 0.0002, + "loss": 1.3846, + "step": 6540 + }, + { + "epoch": 4.893537542024655, + "grad_norm": 0.7875821590423584, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 6550 + }, + { + "epoch": 4.901008591707135, + "grad_norm": 0.8346304297447205, + "learning_rate": 0.0002, + "loss": 1.357, + "step": 6560 + }, + { + "epoch": 4.908479641389615, + "grad_norm": 0.7710739374160767, + "learning_rate": 0.0002, + "loss": 1.4522, + "step": 6570 + }, + { + "epoch": 4.915950691072096, + "grad_norm": 0.7015138268470764, + "learning_rate": 0.0002, + "loss": 1.4465, + "step": 6580 + }, + { + "epoch": 4.923421740754576, + "grad_norm": 0.8707432150840759, + "learning_rate": 0.0002, + "loss": 1.435, + "step": 6590 + }, + { + "epoch": 4.930892790437056, + "grad_norm": 0.786601185798645, + "learning_rate": 0.0002, + "loss": 1.2968, + "step": 6600 + }, + { + "epoch": 4.938363840119536, + "grad_norm": 0.978519082069397, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 6610 + }, + { + "epoch": 4.9458348898020175, + "grad_norm": 0.8102927207946777, + "learning_rate": 0.0002, + "loss": 1.3997, + "step": 6620 + }, + { + "epoch": 4.953305939484498, + "grad_norm": 0.7628704309463501, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 6630 + }, + { + "epoch": 4.960776989166978, + "grad_norm": 0.8053455352783203, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 6640 + }, + { + "epoch": 4.968248038849458, + "grad_norm": 0.8680412173271179, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 6650 + }, + { + "epoch": 4.975719088531939, + "grad_norm": 0.7415758371353149, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 6660 + }, + { + "epoch": 4.983190138214419, + "grad_norm": 0.7730312347412109, + "learning_rate": 0.0002, + "loss": 1.3793, + "step": 6670 + }, + { + "epoch": 4.990661187896899, + "grad_norm": 0.7924041152000427, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 6680 + }, + { + "epoch": 4.99813223757938, + "grad_norm": 0.8677893877029419, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 6690 + }, + { + "epoch": 4.999626447515876, + "eval_loss": 1.9444633722305298, + "eval_runtime": 39.3488, + "eval_samples_per_second": 13.088, + "eval_steps_per_second": 1.652, + "step": 6692 + }, + { + "epoch": 5.00560328726186, + "grad_norm": 0.7102245092391968, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 6700 + }, + { + "epoch": 5.0130743369443405, + "grad_norm": 1.0425463914871216, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 6710 + }, + { + "epoch": 5.0205453866268215, + "grad_norm": 0.9320756793022156, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 6720 + }, + { + "epoch": 5.028016436309302, + "grad_norm": 0.8797217607498169, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 6730 + }, + { + "epoch": 5.035487485991782, + "grad_norm": 2.135707139968872, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 6740 + }, + { + "epoch": 5.042958535674262, + "grad_norm": 0.8747734427452087, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 6750 + }, + { + "epoch": 5.050429585356743, + "grad_norm": 0.9981076717376709, + "learning_rate": 0.0002, + "loss": 1.1675, + "step": 6760 + }, + { + "epoch": 5.057900635039223, + "grad_norm": 0.985078752040863, + "learning_rate": 0.0002, + "loss": 1.1976, + "step": 6770 + }, + { + "epoch": 5.065371684721703, + "grad_norm": 1.0974019765853882, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 6780 + }, + { + "epoch": 5.072842734404184, + "grad_norm": 0.9823219180107117, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6790 + }, + { + "epoch": 5.080313784086664, + "grad_norm": 1.122605562210083, + "learning_rate": 0.0002, + "loss": 1.2586, + "step": 6800 + }, + { + "epoch": 5.0877848337691445, + "grad_norm": 0.8556802272796631, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6810 + }, + { + "epoch": 5.095255883451625, + "grad_norm": 1.1699262857437134, + "learning_rate": 0.0002, + "loss": 1.1908, + "step": 6820 + }, + { + "epoch": 5.102726933134106, + "grad_norm": 1.0440590381622314, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6830 + }, + { + "epoch": 5.110197982816586, + "grad_norm": 1.0445300340652466, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 6840 + }, + { + "epoch": 5.117669032499066, + "grad_norm": 0.8289563059806824, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 6850 + }, + { + "epoch": 5.125140082181547, + "grad_norm": 1.1051193475723267, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 6860 + }, + { + "epoch": 5.132611131864027, + "grad_norm": 0.9345614910125732, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 6870 + }, + { + "epoch": 5.140082181546507, + "grad_norm": 1.1222996711730957, + "learning_rate": 0.0002, + "loss": 1.3021, + "step": 6880 + }, + { + "epoch": 5.147553231228987, + "grad_norm": 0.9405338764190674, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 6890 + }, + { + "epoch": 5.155024280911468, + "grad_norm": 1.0935171842575073, + "learning_rate": 0.0002, + "loss": 1.2367, + "step": 6900 + }, + { + "epoch": 5.1624953305939485, + "grad_norm": 1.0438612699508667, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 6910 + }, + { + "epoch": 5.169966380276429, + "grad_norm": 1.1189004182815552, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6920 + }, + { + "epoch": 5.17743742995891, + "grad_norm": 1.0533215999603271, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 6930 + }, + { + "epoch": 5.18490847964139, + "grad_norm": 0.9779648780822754, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 6940 + }, + { + "epoch": 5.19237952932387, + "grad_norm": 0.8920868635177612, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 6950 + }, + { + "epoch": 5.19985057900635, + "grad_norm": 0.8374548554420471, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 6960 + }, + { + "epoch": 5.207321628688831, + "grad_norm": 1.0490682125091553, + "learning_rate": 0.0002, + "loss": 1.2775, + "step": 6970 + }, + { + "epoch": 5.214792678371311, + "grad_norm": 0.9658287167549133, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 6980 + }, + { + "epoch": 5.222263728053791, + "grad_norm": 0.9652056097984314, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6990 + }, + { + "epoch": 5.229734777736272, + "grad_norm": 0.9141794443130493, + "learning_rate": 0.0002, + "loss": 1.3023, + "step": 7000 + }, + { + "epoch": 5.2372058274187525, + "grad_norm": 0.9831376671791077, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 7010 + }, + { + "epoch": 5.244676877101233, + "grad_norm": 1.0198718309402466, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 7020 + }, + { + "epoch": 5.252147926783713, + "grad_norm": 0.9647888541221619, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 7030 + }, + { + "epoch": 5.259618976466194, + "grad_norm": 1.3941649198532104, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 7040 + }, + { + "epoch": 5.267090026148674, + "grad_norm": 1.0305466651916504, + "learning_rate": 0.0002, + "loss": 1.2885, + "step": 7050 + }, + { + "epoch": 5.274561075831154, + "grad_norm": 0.9577859044075012, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 7060 + }, + { + "epoch": 5.282032125513634, + "grad_norm": 1.149092197418213, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 7070 + }, + { + "epoch": 5.289503175196115, + "grad_norm": 1.2582733631134033, + "learning_rate": 0.0002, + "loss": 1.2986, + "step": 7080 + }, + { + "epoch": 5.296974224878595, + "grad_norm": 1.1777442693710327, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 7090 + }, + { + "epoch": 5.3044452745610755, + "grad_norm": 1.0076404809951782, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 7100 + }, + { + "epoch": 5.3119163242435565, + "grad_norm": 0.9037365913391113, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 7110 + }, + { + "epoch": 5.319387373926037, + "grad_norm": 0.9428724646568298, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 7120 + }, + { + "epoch": 5.326858423608517, + "grad_norm": 0.9935154318809509, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 7130 + }, + { + "epoch": 5.334329473290998, + "grad_norm": 1.087500810623169, + "learning_rate": 0.0002, + "loss": 1.2833, + "step": 7140 + }, + { + "epoch": 5.341800522973478, + "grad_norm": 0.8543072938919067, + "learning_rate": 0.0002, + "loss": 1.2304, + "step": 7150 + }, + { + "epoch": 5.349271572655958, + "grad_norm": 0.9323700070381165, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 7160 + }, + { + "epoch": 5.356742622338438, + "grad_norm": 1.0037827491760254, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 7170 + }, + { + "epoch": 5.364213672020919, + "grad_norm": 0.8746469616889954, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 7180 + }, + { + "epoch": 5.371684721703399, + "grad_norm": 0.9516328573226929, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 7190 + }, + { + "epoch": 5.3791557713858795, + "grad_norm": 0.9395177364349365, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 7200 + }, + { + "epoch": 5.38662682106836, + "grad_norm": 1.000369906425476, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 7210 + }, + { + "epoch": 5.394097870750841, + "grad_norm": 1.0845502614974976, + "learning_rate": 0.0002, + "loss": 1.2337, + "step": 7220 + }, + { + "epoch": 5.401568920433321, + "grad_norm": 0.8975145220756531, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 7230 + }, + { + "epoch": 5.409039970115801, + "grad_norm": 1.040077805519104, + "learning_rate": 0.0002, + "loss": 1.2306, + "step": 7240 + }, + { + "epoch": 5.416511019798282, + "grad_norm": 1.0729942321777344, + "learning_rate": 0.0002, + "loss": 1.2277, + "step": 7250 + }, + { + "epoch": 5.423982069480762, + "grad_norm": 0.8322232961654663, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 7260 + }, + { + "epoch": 5.431453119163242, + "grad_norm": 1.0654641389846802, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 7270 + }, + { + "epoch": 5.438924168845723, + "grad_norm": 1.0445852279663086, + "learning_rate": 0.0002, + "loss": 1.268, + "step": 7280 + }, + { + "epoch": 5.446395218528203, + "grad_norm": 1.0762956142425537, + "learning_rate": 0.0002, + "loss": 1.2743, + "step": 7290 + }, + { + "epoch": 5.4538662682106835, + "grad_norm": 0.9721953868865967, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7300 + }, + { + "epoch": 5.461337317893164, + "grad_norm": 0.9238539338111877, + "learning_rate": 0.0002, + "loss": 1.2833, + "step": 7310 + }, + { + "epoch": 5.468808367575645, + "grad_norm": 0.9912874102592468, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 7320 + }, + { + "epoch": 5.476279417258125, + "grad_norm": 1.0727077722549438, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 7330 + }, + { + "epoch": 5.483750466940605, + "grad_norm": 0.8633865118026733, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 7340 + }, + { + "epoch": 5.491221516623085, + "grad_norm": 0.9396262764930725, + "learning_rate": 0.0002, + "loss": 1.3155, + "step": 7350 + }, + { + "epoch": 5.498692566305566, + "grad_norm": 1.0253715515136719, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 7360 + }, + { + "epoch": 5.506163615988046, + "grad_norm": 1.006047010421753, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 7370 + }, + { + "epoch": 5.513634665670526, + "grad_norm": 0.9781233072280884, + "learning_rate": 0.0002, + "loss": 1.3107, + "step": 7380 + }, + { + "epoch": 5.521105715353007, + "grad_norm": 0.9945126175880432, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 7390 + }, + { + "epoch": 5.528576765035488, + "grad_norm": 0.9081175327301025, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 7400 + }, + { + "epoch": 5.536047814717968, + "grad_norm": 1.2215938568115234, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 7410 + }, + { + "epoch": 5.543518864400449, + "grad_norm": 1.0724077224731445, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 7420 + }, + { + "epoch": 5.550989914082929, + "grad_norm": 1.106955885887146, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 7430 + }, + { + "epoch": 5.558460963765409, + "grad_norm": 1.0657650232315063, + "learning_rate": 0.0002, + "loss": 1.2125, + "step": 7440 + }, + { + "epoch": 5.565932013447889, + "grad_norm": 0.9725455641746521, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 7450 + }, + { + "epoch": 5.57340306313037, + "grad_norm": 0.8604224324226379, + "learning_rate": 0.0002, + "loss": 1.3297, + "step": 7460 + }, + { + "epoch": 5.58087411281285, + "grad_norm": 0.9913371205329895, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 7470 + }, + { + "epoch": 5.58834516249533, + "grad_norm": 1.012073040008545, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 7480 + }, + { + "epoch": 5.5958162121778106, + "grad_norm": 1.1003159284591675, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 7490 + }, + { + "epoch": 5.603287261860292, + "grad_norm": 0.9104593992233276, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7500 + }, + { + "epoch": 5.610758311542772, + "grad_norm": 0.9480831623077393, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 7510 + }, + { + "epoch": 5.618229361225252, + "grad_norm": 1.0826456546783447, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 7520 + }, + { + "epoch": 5.625700410907733, + "grad_norm": 0.8286259174346924, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 7530 + }, + { + "epoch": 5.633171460590213, + "grad_norm": 0.9145061373710632, + "learning_rate": 0.0002, + "loss": 1.2918, + "step": 7540 + }, + { + "epoch": 5.640642510272693, + "grad_norm": 0.9363601803779602, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 7550 + }, + { + "epoch": 5.648113559955174, + "grad_norm": 0.9553244709968567, + "learning_rate": 0.0002, + "loss": 1.2265, + "step": 7560 + }, + { + "epoch": 5.655584609637654, + "grad_norm": 1.0343557596206665, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 7570 + }, + { + "epoch": 5.663055659320134, + "grad_norm": 0.8734238743782043, + "learning_rate": 0.0002, + "loss": 1.3171, + "step": 7580 + }, + { + "epoch": 5.670526709002615, + "grad_norm": 1.0230586528778076, + "learning_rate": 0.0002, + "loss": 1.2785, + "step": 7590 + }, + { + "epoch": 5.677997758685096, + "grad_norm": 1.0063409805297852, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 7600 + }, + { + "epoch": 5.685468808367576, + "grad_norm": 1.0104626417160034, + "learning_rate": 0.0002, + "loss": 1.2396, + "step": 7610 + }, + { + "epoch": 5.692939858050056, + "grad_norm": 0.9528168439865112, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 7620 + }, + { + "epoch": 5.700410907732536, + "grad_norm": 0.9799878597259521, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 7630 + }, + { + "epoch": 5.707881957415017, + "grad_norm": 0.969351589679718, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 7640 + }, + { + "epoch": 5.715353007097497, + "grad_norm": 1.3037652969360352, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 7650 + }, + { + "epoch": 5.722824056779977, + "grad_norm": 1.0640486478805542, + "learning_rate": 0.0002, + "loss": 1.3126, + "step": 7660 + }, + { + "epoch": 5.730295106462458, + "grad_norm": 1.0416420698165894, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 7670 + }, + { + "epoch": 5.7377661561449385, + "grad_norm": 0.8893619775772095, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 7680 + }, + { + "epoch": 5.745237205827419, + "grad_norm": 0.8512844443321228, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 7690 + }, + { + "epoch": 5.7527082555099, + "grad_norm": 0.9955748319625854, + "learning_rate": 0.0002, + "loss": 1.3328, + "step": 7700 + }, + { + "epoch": 5.76017930519238, + "grad_norm": 1.0409910678863525, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7710 + }, + { + "epoch": 5.76765035487486, + "grad_norm": 1.010097861289978, + "learning_rate": 0.0002, + "loss": 1.3518, + "step": 7720 + }, + { + "epoch": 5.77512140455734, + "grad_norm": 0.8974892497062683, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 7730 + }, + { + "epoch": 5.782592454239821, + "grad_norm": 0.972835123538971, + "learning_rate": 0.0002, + "loss": 1.2743, + "step": 7740 + }, + { + "epoch": 5.790063503922301, + "grad_norm": 0.9607440829277039, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 7750 + }, + { + "epoch": 5.797534553604781, + "grad_norm": 0.9426500797271729, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 7760 + }, + { + "epoch": 5.8050056032872615, + "grad_norm": 0.8745320439338684, + "learning_rate": 0.0002, + "loss": 1.274, + "step": 7770 + }, + { + "epoch": 5.8124766529697425, + "grad_norm": 1.0117204189300537, + "learning_rate": 0.0002, + "loss": 1.3009, + "step": 7780 + }, + { + "epoch": 5.819947702652223, + "grad_norm": 1.0387755632400513, + "learning_rate": 0.0002, + "loss": 1.3135, + "step": 7790 + }, + { + "epoch": 5.827418752334703, + "grad_norm": 1.0709784030914307, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 7800 + }, + { + "epoch": 5.834889802017184, + "grad_norm": 0.9512667655944824, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 7810 + }, + { + "epoch": 5.842360851699664, + "grad_norm": 1.021094560623169, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 7820 + }, + { + "epoch": 5.849831901382144, + "grad_norm": 1.117491364479065, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 7830 + }, + { + "epoch": 5.857302951064625, + "grad_norm": 0.9252554178237915, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 7840 + }, + { + "epoch": 5.864774000747105, + "grad_norm": 1.1416207551956177, + "learning_rate": 0.0002, + "loss": 1.2976, + "step": 7850 + }, + { + "epoch": 5.872245050429585, + "grad_norm": 1.1219907999038696, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 7860 + }, + { + "epoch": 5.8797161001120655, + "grad_norm": 0.8300467729568481, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 7870 + }, + { + "epoch": 5.8871871497945465, + "grad_norm": 1.00551438331604, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 7880 + }, + { + "epoch": 5.894658199477027, + "grad_norm": 0.8981153964996338, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 7890 + }, + { + "epoch": 5.902129249159507, + "grad_norm": 1.0247976779937744, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 7900 + }, + { + "epoch": 5.909600298841987, + "grad_norm": 1.0820319652557373, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 7910 + }, + { + "epoch": 5.917071348524468, + "grad_norm": 0.952675461769104, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 7920 + }, + { + "epoch": 5.924542398206948, + "grad_norm": 0.8666740655899048, + "learning_rate": 0.0002, + "loss": 1.307, + "step": 7930 + }, + { + "epoch": 5.932013447889428, + "grad_norm": 0.8640421032905579, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 7940 + }, + { + "epoch": 5.939484497571909, + "grad_norm": 1.2343276739120483, + "learning_rate": 0.0002, + "loss": 1.2386, + "step": 7950 + }, + { + "epoch": 5.946955547254389, + "grad_norm": 0.958046555519104, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 7960 + }, + { + "epoch": 5.9544265969368695, + "grad_norm": 1.0538510084152222, + "learning_rate": 0.0002, + "loss": 1.2352, + "step": 7970 + }, + { + "epoch": 5.9618976466193505, + "grad_norm": 1.2681571245193481, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 7980 + }, + { + "epoch": 5.969368696301831, + "grad_norm": 0.8171183466911316, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 7990 + }, + { + "epoch": 5.976839745984311, + "grad_norm": 0.9109523892402649, + "learning_rate": 0.0002, + "loss": 1.3412, + "step": 8000 + }, + { + "epoch": 5.984310795666791, + "grad_norm": 1.0040639638900757, + "learning_rate": 0.0002, + "loss": 1.3497, + "step": 8010 + }, + { + "epoch": 5.991781845349272, + "grad_norm": 0.9596554040908813, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 8020 + }, + { + "epoch": 5.999252895031752, + "grad_norm": 0.9782963991165161, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 8030 + }, + { + "epoch": 6.0, + "eval_loss": 2.0417845249176025, + "eval_runtime": 38.8465, + "eval_samples_per_second": 13.257, + "eval_steps_per_second": 1.673, + "step": 8031 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.716566978626847e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-8031/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5accf51f7e52c9bee3ed368a11822fe2a33af2c0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426bc36655480ed54250101ac6f4186df4dc96d04f6982887be431fc47bdad37 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..efc57b90381ca916655b2cd9864166283f838984 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e12464e1a6b9f8107b802c37b1dc5216408641b8e1fbe26005f72261a45e9e +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..72ca13584784595405f5a7f35258a0b2246b4293 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2040d668c6e113af8075201ef00445e6e60836ece991e36e11c7cba911d1a2e9 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a03b1d1b1241d394e4954b9fcd6b8c4f978efd7e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e89ee177280e72c7dd395ddce5b22981f15c113da83fc9225f2ba2292930a05 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..82a9934215d2273dccdea055783314001f0ff7b6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/trainer_state.json @@ -0,0 +1,6641 @@ +{ + "best_metric": 1.8046749830245972, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", + "epoch": 6.999626447515876, + "eval_steps": 10, + "global_step": 9369, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007471049682480389, + "grad_norm": 0.4912872612476349, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 10 + }, + { + "epoch": 0.014942099364960777, + "grad_norm": 0.4856316149234772, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 20 + }, + { + "epoch": 0.022413149047441166, + "grad_norm": 0.47683125734329224, + "learning_rate": 0.0002, + "loss": 2.0957, + "step": 30 + }, + { + "epoch": 0.029884198729921554, + "grad_norm": 0.515082597732544, + "learning_rate": 0.0002, + "loss": 1.8908, + "step": 40 + }, + { + "epoch": 0.03735524841240194, + "grad_norm": 0.5299215316772461, + "learning_rate": 0.0002, + "loss": 1.9704, + "step": 50 + }, + { + "epoch": 0.04482629809488233, + "grad_norm": 0.4951399862766266, + "learning_rate": 0.0002, + "loss": 1.9225, + "step": 60 + }, + { + "epoch": 0.05229734777736272, + "grad_norm": 0.48079821467399597, + "learning_rate": 0.0002, + "loss": 1.9742, + "step": 70 + }, + { + "epoch": 0.05976839745984311, + "grad_norm": 0.49402132630348206, + "learning_rate": 0.0002, + "loss": 1.9466, + "step": 80 + }, + { + "epoch": 0.0672394471423235, + "grad_norm": 0.4778193235397339, + "learning_rate": 0.0002, + "loss": 1.8691, + "step": 90 + }, + { + "epoch": 0.07471049682480388, + "grad_norm": 0.42472657561302185, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 100 + }, + { + "epoch": 0.08218154650728428, + "grad_norm": 0.4433092474937439, + "learning_rate": 0.0002, + "loss": 1.8744, + "step": 110 + }, + { + "epoch": 0.08965259618976466, + "grad_norm": 0.4472862780094147, + "learning_rate": 0.0002, + "loss": 1.865, + "step": 120 + }, + { + "epoch": 0.09712364587224505, + "grad_norm": 0.42596298456192017, + "learning_rate": 0.0002, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.10459469555472543, + "grad_norm": 0.46645811200141907, + "learning_rate": 0.0002, + "loss": 1.8015, + "step": 140 + }, + { + "epoch": 0.11206574523720583, + "grad_norm": 0.41041234135627747, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 150 + }, + { + "epoch": 0.11953679491968622, + "grad_norm": 0.5329819917678833, + "learning_rate": 0.0002, + "loss": 1.8276, + "step": 160 + }, + { + "epoch": 0.1270078446021666, + "grad_norm": 0.4065922200679779, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 170 + }, + { + "epoch": 0.134478894284647, + "grad_norm": 0.38406994938850403, + "learning_rate": 0.0002, + "loss": 1.8559, + "step": 180 + }, + { + "epoch": 0.14194994396712737, + "grad_norm": 0.4246881306171417, + "learning_rate": 0.0002, + "loss": 1.8647, + "step": 190 + }, + { + "epoch": 0.14942099364960776, + "grad_norm": 0.35136649012565613, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 200 + }, + { + "epoch": 0.15689204333208817, + "grad_norm": 0.43252742290496826, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.16436309301456856, + "grad_norm": 0.39236941933631897, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 220 + }, + { + "epoch": 0.17183414269704894, + "grad_norm": 0.3748249113559723, + "learning_rate": 0.0002, + "loss": 1.818, + "step": 230 + }, + { + "epoch": 0.17930519237952933, + "grad_norm": 0.6432855725288391, + "learning_rate": 0.0002, + "loss": 1.866, + "step": 240 + }, + { + "epoch": 0.1867762420620097, + "grad_norm": 0.34874802827835083, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 250 + }, + { + "epoch": 0.1942472917444901, + "grad_norm": 0.3721984326839447, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 260 + }, + { + "epoch": 0.20171834142697048, + "grad_norm": 0.4339311420917511, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 270 + }, + { + "epoch": 0.20918939110945087, + "grad_norm": 0.4018215537071228, + "learning_rate": 0.0002, + "loss": 1.8665, + "step": 280 + }, + { + "epoch": 0.21666044079193125, + "grad_norm": 0.3278839886188507, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 290 + }, + { + "epoch": 0.22413149047441167, + "grad_norm": 0.36146077513694763, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 300 + }, + { + "epoch": 0.23160254015689205, + "grad_norm": 0.38175010681152344, + "learning_rate": 0.0002, + "loss": 1.7916, + "step": 310 + }, + { + "epoch": 0.23907358983937244, + "grad_norm": 0.44776618480682373, + "learning_rate": 0.0002, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.24654463952185282, + "grad_norm": 0.3933652937412262, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 330 + }, + { + "epoch": 0.2540156892043332, + "grad_norm": 0.3515005111694336, + "learning_rate": 0.0002, + "loss": 1.8393, + "step": 340 + }, + { + "epoch": 0.2614867388868136, + "grad_norm": 0.6683304309844971, + "learning_rate": 0.0002, + "loss": 1.8653, + "step": 350 + }, + { + "epoch": 0.268957788569294, + "grad_norm": 0.37093454599380493, + "learning_rate": 0.0002, + "loss": 1.8797, + "step": 360 + }, + { + "epoch": 0.2764288382517744, + "grad_norm": 0.3450651168823242, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 370 + }, + { + "epoch": 0.28389988793425475, + "grad_norm": 0.5140917301177979, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 380 + }, + { + "epoch": 0.29137093761673516, + "grad_norm": 0.32885563373565674, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 390 + }, + { + "epoch": 0.2988419872992155, + "grad_norm": 0.33962297439575195, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 400 + }, + { + "epoch": 0.30631303698169593, + "grad_norm": 0.3723141849040985, + "learning_rate": 0.0002, + "loss": 1.7467, + "step": 410 + }, + { + "epoch": 0.31378408666417634, + "grad_norm": 0.37173134088516235, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 420 + }, + { + "epoch": 0.3212551363466567, + "grad_norm": 0.33736956119537354, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 430 + }, + { + "epoch": 0.3287261860291371, + "grad_norm": 0.3602448105812073, + "learning_rate": 0.0002, + "loss": 1.8367, + "step": 440 + }, + { + "epoch": 0.33619723571161747, + "grad_norm": 0.3569699227809906, + "learning_rate": 0.0002, + "loss": 1.8058, + "step": 450 + }, + { + "epoch": 0.3436682853940979, + "grad_norm": 0.31009167432785034, + "learning_rate": 0.0002, + "loss": 1.8086, + "step": 460 + }, + { + "epoch": 0.35113933507657824, + "grad_norm": 0.5278693437576294, + "learning_rate": 0.0002, + "loss": 1.8876, + "step": 470 + }, + { + "epoch": 0.35861038475905865, + "grad_norm": 0.3587537109851837, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 480 + }, + { + "epoch": 0.366081434441539, + "grad_norm": 0.3859670162200928, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 490 + }, + { + "epoch": 0.3735524841240194, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 500 + }, + { + "epoch": 0.38102353380649984, + "grad_norm": 0.35052940249443054, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 510 + }, + { + "epoch": 0.3884945834889802, + "grad_norm": 0.2979494333267212, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 520 + }, + { + "epoch": 0.3959656331714606, + "grad_norm": 0.3062683343887329, + "learning_rate": 0.0002, + "loss": 1.8641, + "step": 530 + }, + { + "epoch": 0.40343668285394096, + "grad_norm": 0.3172847330570221, + "learning_rate": 0.0002, + "loss": 1.7651, + "step": 540 + }, + { + "epoch": 0.4109077325364214, + "grad_norm": 0.360435426235199, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 550 + }, + { + "epoch": 0.41837878221890173, + "grad_norm": 0.3427872359752655, + "learning_rate": 0.0002, + "loss": 1.9054, + "step": 560 + }, + { + "epoch": 0.42584983190138215, + "grad_norm": 0.34036558866500854, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 570 + }, + { + "epoch": 0.4333208815838625, + "grad_norm": 0.3365345299243927, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 580 + }, + { + "epoch": 0.4407919312663429, + "grad_norm": 0.35619041323661804, + "learning_rate": 0.0002, + "loss": 1.8328, + "step": 590 + }, + { + "epoch": 0.44826298094882333, + "grad_norm": 0.3569088280200958, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 600 + }, + { + "epoch": 0.4557340306313037, + "grad_norm": 0.3581278622150421, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 610 + }, + { + "epoch": 0.4632050803137841, + "grad_norm": 0.43197110295295715, + "learning_rate": 0.0002, + "loss": 1.7078, + "step": 620 + }, + { + "epoch": 0.47067612999626446, + "grad_norm": 0.33966198563575745, + "learning_rate": 0.0002, + "loss": 1.8257, + "step": 630 + }, + { + "epoch": 0.47814717967874487, + "grad_norm": 0.3343866467475891, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 640 + }, + { + "epoch": 0.48561822936122523, + "grad_norm": 0.33878564834594727, + "learning_rate": 0.0002, + "loss": 1.8191, + "step": 650 + }, + { + "epoch": 0.49308927904370564, + "grad_norm": 0.387195885181427, + "learning_rate": 0.0002, + "loss": 1.8801, + "step": 660 + }, + { + "epoch": 0.500560328726186, + "grad_norm": 0.3755440413951874, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 670 + }, + { + "epoch": 0.5080313784086664, + "grad_norm": 0.3272816836833954, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 680 + }, + { + "epoch": 0.5155024280911468, + "grad_norm": 0.36063864827156067, + "learning_rate": 0.0002, + "loss": 1.8156, + "step": 690 + }, + { + "epoch": 0.5229734777736272, + "grad_norm": 0.35317373275756836, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 700 + }, + { + "epoch": 0.5304445274561076, + "grad_norm": 0.3561195433139801, + "learning_rate": 0.0002, + "loss": 1.7603, + "step": 710 + }, + { + "epoch": 0.537915577138588, + "grad_norm": 0.31124624609947205, + "learning_rate": 0.0002, + "loss": 1.8149, + "step": 720 + }, + { + "epoch": 0.5453866268210683, + "grad_norm": 0.3294544517993927, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 730 + }, + { + "epoch": 0.5528576765035488, + "grad_norm": 0.31933900713920593, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 740 + }, + { + "epoch": 0.5603287261860291, + "grad_norm": 0.3226020634174347, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 750 + }, + { + "epoch": 0.5677997758685095, + "grad_norm": 0.3147525489330292, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 760 + }, + { + "epoch": 0.57527082555099, + "grad_norm": 0.32234328985214233, + "learning_rate": 0.0002, + "loss": 1.9028, + "step": 770 + }, + { + "epoch": 0.5827418752334703, + "grad_norm": 0.3258664309978485, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 780 + }, + { + "epoch": 0.5902129249159507, + "grad_norm": 0.3166961967945099, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 790 + }, + { + "epoch": 0.597683974598431, + "grad_norm": 0.35621458292007446, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 800 + }, + { + "epoch": 0.6051550242809115, + "grad_norm": 0.3236999213695526, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 810 + }, + { + "epoch": 0.6126260739633919, + "grad_norm": 0.2892923653125763, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 820 + }, + { + "epoch": 0.6200971236458722, + "grad_norm": 0.4098321497440338, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 830 + }, + { + "epoch": 0.6275681733283527, + "grad_norm": 0.3337118923664093, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 840 + }, + { + "epoch": 0.635039223010833, + "grad_norm": 0.30416029691696167, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 850 + }, + { + "epoch": 0.6425102726933134, + "grad_norm": 0.3361026346683502, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 860 + }, + { + "epoch": 0.6499813223757938, + "grad_norm": 0.3537365198135376, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 870 + }, + { + "epoch": 0.6574523720582742, + "grad_norm": 0.33854469656944275, + "learning_rate": 0.0002, + "loss": 1.7825, + "step": 880 + }, + { + "epoch": 0.6649234217407546, + "grad_norm": 0.3332272469997406, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 890 + }, + { + "epoch": 0.6723944714232349, + "grad_norm": 0.34954726696014404, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 900 + }, + { + "epoch": 0.6798655211057153, + "grad_norm": 0.2921750247478485, + "learning_rate": 0.0002, + "loss": 1.7917, + "step": 910 + }, + { + "epoch": 0.6873365707881958, + "grad_norm": 0.30508682131767273, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 920 + }, + { + "epoch": 0.6948076204706761, + "grad_norm": 0.32268425822257996, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 930 + }, + { + "epoch": 0.7022786701531565, + "grad_norm": 0.2844390869140625, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 940 + }, + { + "epoch": 0.709749719835637, + "grad_norm": 0.31263890862464905, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 950 + }, + { + "epoch": 0.7172207695181173, + "grad_norm": 0.3626808822154999, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 960 + }, + { + "epoch": 0.7246918192005977, + "grad_norm": 0.3322749733924866, + "learning_rate": 0.0002, + "loss": 1.853, + "step": 970 + }, + { + "epoch": 0.732162868883078, + "grad_norm": 0.29177871346473694, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 980 + }, + { + "epoch": 0.7396339185655585, + "grad_norm": 0.35405513644218445, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 990 + }, + { + "epoch": 0.7471049682480388, + "grad_norm": 0.39318400621414185, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 1000 + }, + { + "epoch": 0.7545760179305192, + "grad_norm": 0.29401418566703796, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1010 + }, + { + "epoch": 0.7620470676129997, + "grad_norm": 0.3271748721599579, + "learning_rate": 0.0002, + "loss": 1.7649, + "step": 1020 + }, + { + "epoch": 0.76951811729548, + "grad_norm": 0.30883970856666565, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1030 + }, + { + "epoch": 0.7769891669779604, + "grad_norm": 0.3411838412284851, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 1040 + }, + { + "epoch": 0.7844602166604407, + "grad_norm": 0.30608129501342773, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 1050 + }, + { + "epoch": 0.7919312663429212, + "grad_norm": 0.30899080634117126, + "learning_rate": 0.0002, + "loss": 1.7815, + "step": 1060 + }, + { + "epoch": 0.7994023160254016, + "grad_norm": 0.3160453140735626, + "learning_rate": 0.0002, + "loss": 1.7625, + "step": 1070 + }, + { + "epoch": 0.8068733657078819, + "grad_norm": 0.30947187542915344, + "learning_rate": 0.0002, + "loss": 1.8452, + "step": 1080 + }, + { + "epoch": 0.8143444153903624, + "grad_norm": 0.3103134036064148, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1090 + }, + { + "epoch": 0.8218154650728428, + "grad_norm": 0.31771138310432434, + "learning_rate": 0.0002, + "loss": 1.842, + "step": 1100 + }, + { + "epoch": 0.8292865147553231, + "grad_norm": 0.5860997438430786, + "learning_rate": 0.0002, + "loss": 1.7918, + "step": 1110 + }, + { + "epoch": 0.8367575644378035, + "grad_norm": 0.3230148255825043, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 1120 + }, + { + "epoch": 0.8442286141202839, + "grad_norm": 0.29611510038375854, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 1130 + }, + { + "epoch": 0.8516996638027643, + "grad_norm": 0.3373654782772064, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 1140 + }, + { + "epoch": 0.8591707134852447, + "grad_norm": 0.3474279046058655, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1150 + }, + { + "epoch": 0.866641763167725, + "grad_norm": 0.35057875514030457, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1160 + }, + { + "epoch": 0.8741128128502055, + "grad_norm": 0.39537495374679565, + "learning_rate": 0.0002, + "loss": 1.8273, + "step": 1170 + }, + { + "epoch": 0.8815838625326858, + "grad_norm": 0.3714233636856079, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1180 + }, + { + "epoch": 0.8890549122151662, + "grad_norm": 0.2950296998023987, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 1190 + }, + { + "epoch": 0.8965259618976467, + "grad_norm": 0.38182979822158813, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 1200 + }, + { + "epoch": 0.903997011580127, + "grad_norm": 0.27883678674697876, + "learning_rate": 0.0002, + "loss": 1.827, + "step": 1210 + }, + { + "epoch": 0.9114680612626074, + "grad_norm": 0.33874374628067017, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 1220 + }, + { + "epoch": 0.9189391109450877, + "grad_norm": 0.3014272153377533, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1230 + }, + { + "epoch": 0.9264101606275682, + "grad_norm": 0.3194271922111511, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 1240 + }, + { + "epoch": 0.9338812103100486, + "grad_norm": 0.3049403429031372, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1250 + }, + { + "epoch": 0.9413522599925289, + "grad_norm": 0.30621254444122314, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 1260 + }, + { + "epoch": 0.9488233096750094, + "grad_norm": 0.28675132989883423, + "learning_rate": 0.0002, + "loss": 1.8287, + "step": 1270 + }, + { + "epoch": 0.9562943593574897, + "grad_norm": 0.3322032690048218, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1280 + }, + { + "epoch": 0.9637654090399701, + "grad_norm": 0.35408294200897217, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1290 + }, + { + "epoch": 0.9712364587224505, + "grad_norm": 0.36386919021606445, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1300 + }, + { + "epoch": 0.9787075084049309, + "grad_norm": 0.32338324189186096, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 1310 + }, + { + "epoch": 0.9861785580874113, + "grad_norm": 0.3714013993740082, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 1320 + }, + { + "epoch": 0.9936496077698916, + "grad_norm": 0.3133082389831543, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 1330 + }, + { + "epoch": 0.9996264475158759, + "eval_loss": 1.8051470518112183, + "eval_runtime": 38.6332, + "eval_samples_per_second": 13.331, + "eval_steps_per_second": 1.682, + "step": 1338 + }, + { + "epoch": 1.001120657452372, + "grad_norm": 0.31595754623413086, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 1340 + }, + { + "epoch": 1.0085917071348525, + "grad_norm": 0.3095700144767761, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1350 + }, + { + "epoch": 1.0160627568173328, + "grad_norm": 0.34677496552467346, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1360 + }, + { + "epoch": 1.0235338064998132, + "grad_norm": 0.29108840227127075, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1370 + }, + { + "epoch": 1.0310048561822935, + "grad_norm": 0.32356950640678406, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 1380 + }, + { + "epoch": 1.038475905864774, + "grad_norm": 0.4200669229030609, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1390 + }, + { + "epoch": 1.0459469555472545, + "grad_norm": 0.3283711373806, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 1400 + }, + { + "epoch": 1.0534180052297348, + "grad_norm": 0.32898256182670593, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1410 + }, + { + "epoch": 1.0608890549122152, + "grad_norm": 0.38790300488471985, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 1420 + }, + { + "epoch": 1.0683601045946955, + "grad_norm": 0.339800089597702, + "learning_rate": 0.0002, + "loss": 1.6922, + "step": 1430 + }, + { + "epoch": 1.075831154277176, + "grad_norm": 0.3548751175403595, + "learning_rate": 0.0002, + "loss": 1.7076, + "step": 1440 + }, + { + "epoch": 1.0833022039596563, + "grad_norm": 0.35114359855651855, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1450 + }, + { + "epoch": 1.0907732536421366, + "grad_norm": 0.35226720571517944, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 1460 + }, + { + "epoch": 1.0982443033246172, + "grad_norm": 0.33665576577186584, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 1470 + }, + { + "epoch": 1.1057153530070976, + "grad_norm": 0.363889217376709, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1480 + }, + { + "epoch": 1.113186402689578, + "grad_norm": 0.3826201856136322, + "learning_rate": 0.0002, + "loss": 1.7933, + "step": 1490 + }, + { + "epoch": 1.1206574523720583, + "grad_norm": 0.34058740735054016, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 1500 + }, + { + "epoch": 1.1281285020545386, + "grad_norm": 0.3462134301662445, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1510 + }, + { + "epoch": 1.135599551737019, + "grad_norm": 0.3396756052970886, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 1520 + }, + { + "epoch": 1.1430706014194993, + "grad_norm": 0.32004743814468384, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1530 + }, + { + "epoch": 1.15054165110198, + "grad_norm": 0.3397733271121979, + "learning_rate": 0.0002, + "loss": 1.743, + "step": 1540 + }, + { + "epoch": 1.1580127007844603, + "grad_norm": 0.3783262073993683, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 1550 + }, + { + "epoch": 1.1654837504669406, + "grad_norm": 0.35121291875839233, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1560 + }, + { + "epoch": 1.172954800149421, + "grad_norm": 0.35816895961761475, + "learning_rate": 0.0002, + "loss": 1.678, + "step": 1570 + }, + { + "epoch": 1.1804258498319014, + "grad_norm": 0.33843839168548584, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1580 + }, + { + "epoch": 1.1878968995143817, + "grad_norm": 0.3371972143650055, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 1590 + }, + { + "epoch": 1.195367949196862, + "grad_norm": 0.36016878485679626, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 1600 + }, + { + "epoch": 1.2028389988793426, + "grad_norm": 0.40879473090171814, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 1610 + }, + { + "epoch": 1.210310048561823, + "grad_norm": 0.3216715455055237, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 1620 + }, + { + "epoch": 1.2177810982443034, + "grad_norm": 0.4482610821723938, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1630 + }, + { + "epoch": 1.2252521479267837, + "grad_norm": 0.3257700502872467, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 1640 + }, + { + "epoch": 1.232723197609264, + "grad_norm": 0.38646459579467773, + "learning_rate": 0.0002, + "loss": 1.7177, + "step": 1650 + }, + { + "epoch": 1.2401942472917444, + "grad_norm": 0.4081360697746277, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1660 + }, + { + "epoch": 1.2476652969742248, + "grad_norm": 0.4326848089694977, + "learning_rate": 0.0002, + "loss": 1.7519, + "step": 1670 + }, + { + "epoch": 1.2551363466567054, + "grad_norm": 0.346401572227478, + "learning_rate": 0.0002, + "loss": 1.6752, + "step": 1680 + }, + { + "epoch": 1.2626073963391857, + "grad_norm": 0.34536251425743103, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1690 + }, + { + "epoch": 1.270078446021666, + "grad_norm": 0.41359591484069824, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 1700 + }, + { + "epoch": 1.2775494957041464, + "grad_norm": 0.3530874252319336, + "learning_rate": 0.0002, + "loss": 1.7906, + "step": 1710 + }, + { + "epoch": 1.2850205453866268, + "grad_norm": 0.3702719211578369, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 1720 + }, + { + "epoch": 1.2924915950691072, + "grad_norm": 0.3703329563140869, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 1730 + }, + { + "epoch": 1.2999626447515875, + "grad_norm": 0.37919729948043823, + "learning_rate": 0.0002, + "loss": 1.7221, + "step": 1740 + }, + { + "epoch": 1.307433694434068, + "grad_norm": 0.32526856660842896, + "learning_rate": 0.0002, + "loss": 1.7859, + "step": 1750 + }, + { + "epoch": 1.3149047441165485, + "grad_norm": 0.36752620339393616, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1760 + }, + { + "epoch": 1.3223757937990288, + "grad_norm": 0.3398192524909973, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1770 + }, + { + "epoch": 1.3298468434815092, + "grad_norm": 0.37435585260391235, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1780 + }, + { + "epoch": 1.3373178931639895, + "grad_norm": 0.35793280601501465, + "learning_rate": 0.0002, + "loss": 1.7393, + "step": 1790 + }, + { + "epoch": 1.3447889428464699, + "grad_norm": 0.35481882095336914, + "learning_rate": 0.0002, + "loss": 1.7266, + "step": 1800 + }, + { + "epoch": 1.3522599925289502, + "grad_norm": 0.3786393105983734, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1810 + }, + { + "epoch": 1.3597310422114308, + "grad_norm": 0.33245593309402466, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1820 + }, + { + "epoch": 1.3672020918939112, + "grad_norm": 0.35388344526290894, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1830 + }, + { + "epoch": 1.3746731415763915, + "grad_norm": 0.3695325553417206, + "learning_rate": 0.0002, + "loss": 1.6968, + "step": 1840 + }, + { + "epoch": 1.382144191258872, + "grad_norm": 0.3683604598045349, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 1850 + }, + { + "epoch": 1.3896152409413522, + "grad_norm": 0.3753012418746948, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1860 + }, + { + "epoch": 1.3970862906238326, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.0002, + "loss": 1.6969, + "step": 1870 + }, + { + "epoch": 1.404557340306313, + "grad_norm": 0.3877500295639038, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 1880 + }, + { + "epoch": 1.4120283899887935, + "grad_norm": 0.33525151014328003, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 1890 + }, + { + "epoch": 1.4194994396712737, + "grad_norm": 0.3697299659252167, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1900 + }, + { + "epoch": 1.4269704893537543, + "grad_norm": 0.4029286205768585, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 1910 + }, + { + "epoch": 1.4344415390362346, + "grad_norm": 0.3596203029155731, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 1920 + }, + { + "epoch": 1.441912588718715, + "grad_norm": 0.450783908367157, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 1930 + }, + { + "epoch": 1.4493836384011953, + "grad_norm": 0.3651481866836548, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1940 + }, + { + "epoch": 1.4568546880836757, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 1950 + }, + { + "epoch": 1.4643257377661563, + "grad_norm": 0.39684420824050903, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 1960 + }, + { + "epoch": 1.4717967874486364, + "grad_norm": 0.34618663787841797, + "learning_rate": 0.0002, + "loss": 1.7514, + "step": 1970 + }, + { + "epoch": 1.479267837131117, + "grad_norm": 0.4150386452674866, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1980 + }, + { + "epoch": 1.4867388868135973, + "grad_norm": 0.35500776767730713, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 1990 + }, + { + "epoch": 1.4942099364960777, + "grad_norm": 0.344144344329834, + "learning_rate": 0.0002, + "loss": 1.7322, + "step": 2000 + }, + { + "epoch": 1.501680986178558, + "grad_norm": 0.3340149223804474, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2010 + }, + { + "epoch": 1.5091520358610384, + "grad_norm": 0.37685006856918335, + "learning_rate": 0.0002, + "loss": 1.7508, + "step": 2020 + }, + { + "epoch": 1.516623085543519, + "grad_norm": 0.3699876368045807, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 2030 + }, + { + "epoch": 1.5240941352259991, + "grad_norm": 0.3370307385921478, + "learning_rate": 0.0002, + "loss": 1.7357, + "step": 2040 + }, + { + "epoch": 1.5315651849084797, + "grad_norm": 0.37780630588531494, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 2050 + }, + { + "epoch": 1.53903623459096, + "grad_norm": 0.370259165763855, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 2060 + }, + { + "epoch": 1.5465072842734404, + "grad_norm": 0.3440011441707611, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 2070 + }, + { + "epoch": 1.5539783339559208, + "grad_norm": 0.40382063388824463, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 2080 + }, + { + "epoch": 1.5614493836384011, + "grad_norm": 0.38002029061317444, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 2090 + }, + { + "epoch": 1.5689204333208817, + "grad_norm": 0.3658451437950134, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2100 + }, + { + "epoch": 1.5763914830033618, + "grad_norm": 0.354842871427536, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 2110 + }, + { + "epoch": 1.5838625326858424, + "grad_norm": 0.34735530614852905, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 2120 + }, + { + "epoch": 1.5913335823683228, + "grad_norm": 0.377581924200058, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 2130 + }, + { + "epoch": 1.5988046320508031, + "grad_norm": 0.41254034638404846, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 2140 + }, + { + "epoch": 1.6062756817332835, + "grad_norm": 0.3630715310573578, + "learning_rate": 0.0002, + "loss": 1.6782, + "step": 2150 + }, + { + "epoch": 1.6137467314157639, + "grad_norm": 0.36980143189430237, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 2160 + }, + { + "epoch": 1.6212177810982444, + "grad_norm": 0.3634769320487976, + "learning_rate": 0.0002, + "loss": 1.6847, + "step": 2170 + }, + { + "epoch": 1.6286888307807246, + "grad_norm": 0.3794139623641968, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2180 + }, + { + "epoch": 1.6361598804632052, + "grad_norm": 0.359742134809494, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 2190 + }, + { + "epoch": 1.6436309301456855, + "grad_norm": 0.3770543932914734, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.6511019798281659, + "grad_norm": 0.3797036409378052, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 2210 + }, + { + "epoch": 1.6585730295106462, + "grad_norm": 0.35622093081474304, + "learning_rate": 0.0002, + "loss": 1.7875, + "step": 2220 + }, + { + "epoch": 1.6660440791931266, + "grad_norm": 0.34552520513534546, + "learning_rate": 0.0002, + "loss": 1.6615, + "step": 2230 + }, + { + "epoch": 1.6735151288756072, + "grad_norm": 0.379926860332489, + "learning_rate": 0.0002, + "loss": 1.7522, + "step": 2240 + }, + { + "epoch": 1.6809861785580873, + "grad_norm": 0.37083810567855835, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 2250 + }, + { + "epoch": 1.6884572282405679, + "grad_norm": 0.42746543884277344, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 2260 + }, + { + "epoch": 1.6959282779230482, + "grad_norm": 0.3372884690761566, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2270 + }, + { + "epoch": 1.7033993276055286, + "grad_norm": 0.35220256447792053, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 2280 + }, + { + "epoch": 1.710870377288009, + "grad_norm": 0.3659130930900574, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 2290 + }, + { + "epoch": 1.7183414269704893, + "grad_norm": 0.37629297375679016, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2300 + }, + { + "epoch": 1.7258124766529699, + "grad_norm": 0.36312398314476013, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2310 + }, + { + "epoch": 1.73328352633545, + "grad_norm": 0.467709481716156, + "learning_rate": 0.0002, + "loss": 1.7903, + "step": 2320 + }, + { + "epoch": 1.7407545760179306, + "grad_norm": 0.38685527443885803, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2330 + }, + { + "epoch": 1.748225625700411, + "grad_norm": 0.3578338325023651, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 2340 + }, + { + "epoch": 1.7556966753828913, + "grad_norm": 0.36057502031326294, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2350 + }, + { + "epoch": 1.7631677250653717, + "grad_norm": 0.3615196645259857, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2360 + }, + { + "epoch": 1.770638774747852, + "grad_norm": 0.4118947684764862, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 2370 + }, + { + "epoch": 1.7781098244303326, + "grad_norm": 0.4067276120185852, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2380 + }, + { + "epoch": 1.7855808741128127, + "grad_norm": 0.3979823887348175, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2390 + }, + { + "epoch": 1.7930519237952933, + "grad_norm": 0.44045883417129517, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 2400 + }, + { + "epoch": 1.8005229734777737, + "grad_norm": 0.3998069167137146, + "learning_rate": 0.0002, + "loss": 1.7251, + "step": 2410 + }, + { + "epoch": 1.807994023160254, + "grad_norm": 0.3450094759464264, + "learning_rate": 0.0002, + "loss": 1.7354, + "step": 2420 + }, + { + "epoch": 1.8154650728427344, + "grad_norm": 0.3759009838104248, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 2430 + }, + { + "epoch": 1.8229361225252148, + "grad_norm": 0.34347015619277954, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2440 + }, + { + "epoch": 1.8304071722076953, + "grad_norm": 0.3511228859424591, + "learning_rate": 0.0002, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 1.8378782218901755, + "grad_norm": 0.36853715777397156, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 2460 + }, + { + "epoch": 1.845349271572656, + "grad_norm": 0.40659376978874207, + "learning_rate": 0.0002, + "loss": 1.6931, + "step": 2470 + }, + { + "epoch": 1.8528203212551362, + "grad_norm": 0.39621320366859436, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 2480 + }, + { + "epoch": 1.8602913709376168, + "grad_norm": 0.3753979504108429, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 2490 + }, + { + "epoch": 1.8677624206200971, + "grad_norm": 0.3811938464641571, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2500 + }, + { + "epoch": 1.8752334703025775, + "grad_norm": 0.3432596027851105, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 2510 + }, + { + "epoch": 1.882704519985058, + "grad_norm": 0.3670712113380432, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 2520 + }, + { + "epoch": 1.8901755696675382, + "grad_norm": 0.40907177329063416, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2530 + }, + { + "epoch": 1.8976466193500188, + "grad_norm": 0.3821999728679657, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 2540 + }, + { + "epoch": 1.905117669032499, + "grad_norm": 0.36173978447914124, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 2550 + }, + { + "epoch": 1.9125887187149795, + "grad_norm": 0.38990336656570435, + "learning_rate": 0.0002, + "loss": 1.6939, + "step": 2560 + }, + { + "epoch": 1.9200597683974598, + "grad_norm": 0.35242322087287903, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 2570 + }, + { + "epoch": 1.9275308180799402, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.0002, + "loss": 1.7268, + "step": 2580 + }, + { + "epoch": 1.9350018677624208, + "grad_norm": 0.39540135860443115, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2590 + }, + { + "epoch": 1.942472917444901, + "grad_norm": 0.3444725573062897, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2600 + }, + { + "epoch": 1.9499439671273815, + "grad_norm": 0.3963521718978882, + "learning_rate": 0.0002, + "loss": 1.7259, + "step": 2610 + }, + { + "epoch": 1.9574150168098616, + "grad_norm": 0.3689815402030945, + "learning_rate": 0.0002, + "loss": 1.6946, + "step": 2620 + }, + { + "epoch": 1.9648860664923422, + "grad_norm": 0.3482626676559448, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 2630 + }, + { + "epoch": 1.9723571161748226, + "grad_norm": 0.35832616686820984, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2640 + }, + { + "epoch": 1.979828165857303, + "grad_norm": 0.4776208996772766, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2650 + }, + { + "epoch": 1.9872992155397835, + "grad_norm": 0.32570165395736694, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2660 + }, + { + "epoch": 1.9947702652222636, + "grad_norm": 0.3380725085735321, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2670 + }, + { + "epoch": 2.0, + "eval_loss": 1.8046749830245972, + "eval_runtime": 38.5096, + "eval_samples_per_second": 13.373, + "eval_steps_per_second": 1.688, + "step": 2677 + }, + { + "epoch": 2.002241314904744, + "grad_norm": 0.36817631125450134, + "learning_rate": 0.0002, + "loss": 1.7265, + "step": 2680 + }, + { + "epoch": 2.0097123645872244, + "grad_norm": 0.4056456685066223, + "learning_rate": 0.0002, + "loss": 1.548, + "step": 2690 + }, + { + "epoch": 2.017183414269705, + "grad_norm": 0.37416863441467285, + "learning_rate": 0.0002, + "loss": 1.5515, + "step": 2700 + }, + { + "epoch": 2.024654463952185, + "grad_norm": 0.4273638427257538, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 2710 + }, + { + "epoch": 2.0321255136346656, + "grad_norm": 0.36497923731803894, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2720 + }, + { + "epoch": 2.0395965633171462, + "grad_norm": 0.5021994113922119, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 2730 + }, + { + "epoch": 2.0470676129996264, + "grad_norm": 0.45896220207214355, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 2740 + }, + { + "epoch": 2.054538662682107, + "grad_norm": 0.3973815143108368, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 2750 + }, + { + "epoch": 2.062009712364587, + "grad_norm": 0.4521815776824951, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2760 + }, + { + "epoch": 2.0694807620470677, + "grad_norm": 0.42775002121925354, + "learning_rate": 0.0002, + "loss": 1.6189, + "step": 2770 + }, + { + "epoch": 2.076951811729548, + "grad_norm": 0.48158586025238037, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 2780 + }, + { + "epoch": 2.0844228614120284, + "grad_norm": 0.4612371623516083, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 2790 + }, + { + "epoch": 2.091893911094509, + "grad_norm": 0.42536866664886475, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 2800 + }, + { + "epoch": 2.099364960776989, + "grad_norm": 0.48515772819519043, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 2810 + }, + { + "epoch": 2.1068360104594697, + "grad_norm": 0.41418662667274475, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2820 + }, + { + "epoch": 2.11430706014195, + "grad_norm": 0.4683697819709778, + "learning_rate": 0.0002, + "loss": 1.6266, + "step": 2830 + }, + { + "epoch": 2.1217781098244304, + "grad_norm": 0.4484657049179077, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2840 + }, + { + "epoch": 2.1292491595069105, + "grad_norm": 0.6621400713920593, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 2850 + }, + { + "epoch": 2.136720209189391, + "grad_norm": 0.45074811577796936, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 2860 + }, + { + "epoch": 2.1441912588718717, + "grad_norm": 0.3513113558292389, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2870 + }, + { + "epoch": 2.151662308554352, + "grad_norm": 0.40411314368247986, + "learning_rate": 0.0002, + "loss": 1.6081, + "step": 2880 + }, + { + "epoch": 2.1591333582368324, + "grad_norm": 0.4121065139770508, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 2890 + }, + { + "epoch": 2.1666044079193125, + "grad_norm": 0.44723689556121826, + "learning_rate": 0.0002, + "loss": 1.6324, + "step": 2900 + }, + { + "epoch": 2.174075457601793, + "grad_norm": 0.4226122498512268, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 2910 + }, + { + "epoch": 2.1815465072842732, + "grad_norm": 0.46617650985717773, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2920 + }, + { + "epoch": 2.189017556966754, + "grad_norm": 0.4506422281265259, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 2930 + }, + { + "epoch": 2.1964886066492344, + "grad_norm": 0.4892672896385193, + "learning_rate": 0.0002, + "loss": 1.6112, + "step": 2940 + }, + { + "epoch": 2.2039596563317145, + "grad_norm": 0.44095516204833984, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2950 + }, + { + "epoch": 2.211430706014195, + "grad_norm": 0.41522109508514404, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 2960 + }, + { + "epoch": 2.2189017556966752, + "grad_norm": 0.4860858917236328, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 2970 + }, + { + "epoch": 2.226372805379156, + "grad_norm": 0.42662516236305237, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2980 + }, + { + "epoch": 2.233843855061636, + "grad_norm": 0.4390648305416107, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 2990 + }, + { + "epoch": 2.2413149047441165, + "grad_norm": 0.47515565156936646, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 3000 + }, + { + "epoch": 2.248785954426597, + "grad_norm": 0.4104543924331665, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 3010 + }, + { + "epoch": 2.2562570041090773, + "grad_norm": 0.4404028654098511, + "learning_rate": 0.0002, + "loss": 1.6895, + "step": 3020 + }, + { + "epoch": 2.263728053791558, + "grad_norm": 0.4717366695404053, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3030 + }, + { + "epoch": 2.271199103474038, + "grad_norm": 0.48345857858657837, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 3040 + }, + { + "epoch": 2.2786701531565186, + "grad_norm": 0.5312452912330627, + "learning_rate": 0.0002, + "loss": 1.681, + "step": 3050 + }, + { + "epoch": 2.2861412028389987, + "grad_norm": 0.5073099732398987, + "learning_rate": 0.0002, + "loss": 1.5901, + "step": 3060 + }, + { + "epoch": 2.2936122525214793, + "grad_norm": 0.5027463436126709, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 3070 + }, + { + "epoch": 2.30108330220396, + "grad_norm": 0.5436304807662964, + "learning_rate": 0.0002, + "loss": 1.5862, + "step": 3080 + }, + { + "epoch": 2.30855435188644, + "grad_norm": 0.4701065123081207, + "learning_rate": 0.0002, + "loss": 1.5763, + "step": 3090 + }, + { + "epoch": 2.3160254015689206, + "grad_norm": 0.46988746523857117, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 3100 + }, + { + "epoch": 2.3234964512514007, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 3110 + }, + { + "epoch": 2.3309675009338813, + "grad_norm": 0.5173566937446594, + "learning_rate": 0.0002, + "loss": 1.6291, + "step": 3120 + }, + { + "epoch": 2.3384385506163614, + "grad_norm": 0.40345850586891174, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 3130 + }, + { + "epoch": 2.345909600298842, + "grad_norm": 0.4218924939632416, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 3140 + }, + { + "epoch": 2.3533806499813226, + "grad_norm": 0.41857317090034485, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 3150 + }, + { + "epoch": 2.3608516996638027, + "grad_norm": 0.4197218418121338, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 3160 + }, + { + "epoch": 2.3683227493462833, + "grad_norm": 0.4260677397251129, + "learning_rate": 0.0002, + "loss": 1.6572, + "step": 3170 + }, + { + "epoch": 2.3757937990287634, + "grad_norm": 0.4209042191505432, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3180 + }, + { + "epoch": 2.383264848711244, + "grad_norm": 0.4092234969139099, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3190 + }, + { + "epoch": 2.390735898393724, + "grad_norm": 0.4928431510925293, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 3200 + }, + { + "epoch": 2.3982069480762047, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0002, + "loss": 1.6015, + "step": 3210 + }, + { + "epoch": 2.4056779977586853, + "grad_norm": 0.4368397295475006, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3220 + }, + { + "epoch": 2.4131490474411654, + "grad_norm": 0.46122390031814575, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 3230 + }, + { + "epoch": 2.420620097123646, + "grad_norm": 0.4272301197052002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.428091146806126, + "grad_norm": 0.41480937600135803, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 3250 + }, + { + "epoch": 2.4355621964886067, + "grad_norm": 0.48911941051483154, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 3260 + }, + { + "epoch": 2.443033246171087, + "grad_norm": 0.4444098472595215, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 3270 + }, + { + "epoch": 2.4505042958535674, + "grad_norm": 0.5111684799194336, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 3280 + }, + { + "epoch": 2.457975345536048, + "grad_norm": 0.5058825016021729, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 3290 + }, + { + "epoch": 2.465446395218528, + "grad_norm": 0.44173210859298706, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3300 + }, + { + "epoch": 2.4729174449010087, + "grad_norm": 0.4659745991230011, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 3310 + }, + { + "epoch": 2.480388494583489, + "grad_norm": 0.47237497568130493, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3320 + }, + { + "epoch": 2.4878595442659694, + "grad_norm": 0.47303131222724915, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 3330 + }, + { + "epoch": 2.4953305939484496, + "grad_norm": 0.4522389769554138, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 3340 + }, + { + "epoch": 2.50280164363093, + "grad_norm": 0.4467332363128662, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3350 + }, + { + "epoch": 2.5102726933134107, + "grad_norm": 0.4413762092590332, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 3360 + }, + { + "epoch": 2.517743742995891, + "grad_norm": 0.495514452457428, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 3370 + }, + { + "epoch": 2.5252147926783715, + "grad_norm": 0.4429773986339569, + "learning_rate": 0.0002, + "loss": 1.5839, + "step": 3380 + }, + { + "epoch": 2.5326858423608516, + "grad_norm": 0.4589079022407532, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3390 + }, + { + "epoch": 2.540156892043332, + "grad_norm": 0.4683997333049774, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 3400 + }, + { + "epoch": 2.5476279417258123, + "grad_norm": 0.4651731252670288, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 3410 + }, + { + "epoch": 2.555098991408293, + "grad_norm": 0.45818084478378296, + "learning_rate": 0.0002, + "loss": 1.5918, + "step": 3420 + }, + { + "epoch": 2.5625700410907735, + "grad_norm": 0.45209529995918274, + "learning_rate": 0.0002, + "loss": 1.6326, + "step": 3430 + }, + { + "epoch": 2.5700410907732536, + "grad_norm": 0.4344733655452728, + "learning_rate": 0.0002, + "loss": 1.5606, + "step": 3440 + }, + { + "epoch": 2.577512140455734, + "grad_norm": 0.47435566782951355, + "learning_rate": 0.0002, + "loss": 1.6748, + "step": 3450 + }, + { + "epoch": 2.5849831901382143, + "grad_norm": 0.43841999769210815, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 3460 + }, + { + "epoch": 2.592454239820695, + "grad_norm": 0.4323869049549103, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 3470 + }, + { + "epoch": 2.599925289503175, + "grad_norm": 0.44355881214141846, + "learning_rate": 0.0002, + "loss": 1.5494, + "step": 3480 + }, + { + "epoch": 2.6073963391856556, + "grad_norm": 0.45847779512405396, + "learning_rate": 0.0002, + "loss": 1.665, + "step": 3490 + }, + { + "epoch": 2.614867388868136, + "grad_norm": 0.4411061704158783, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3500 + }, + { + "epoch": 2.6223384385506163, + "grad_norm": 0.4446796178817749, + "learning_rate": 0.0002, + "loss": 1.5868, + "step": 3510 + }, + { + "epoch": 2.629809488233097, + "grad_norm": 0.41969653964042664, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3520 + }, + { + "epoch": 2.637280537915577, + "grad_norm": 0.5263747572898865, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 3530 + }, + { + "epoch": 2.6447515875980576, + "grad_norm": 0.47719451785087585, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3540 + }, + { + "epoch": 2.6522226372805378, + "grad_norm": 0.46574118733406067, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 3550 + }, + { + "epoch": 2.6596936869630183, + "grad_norm": 0.46867135167121887, + "learning_rate": 0.0002, + "loss": 1.618, + "step": 3560 + }, + { + "epoch": 2.667164736645499, + "grad_norm": 0.4441198706626892, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3570 + }, + { + "epoch": 2.674635786327979, + "grad_norm": 0.4871319830417633, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 3580 + }, + { + "epoch": 2.6821068360104596, + "grad_norm": 0.43900373578071594, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 3590 + }, + { + "epoch": 2.6895778856929398, + "grad_norm": 0.42509549856185913, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 3600 + }, + { + "epoch": 2.6970489353754203, + "grad_norm": 0.4691086709499359, + "learning_rate": 0.0002, + "loss": 1.5651, + "step": 3610 + }, + { + "epoch": 2.7045199850579005, + "grad_norm": 0.46318942308425903, + "learning_rate": 0.0002, + "loss": 1.5491, + "step": 3620 + }, + { + "epoch": 2.711991034740381, + "grad_norm": 0.44631096720695496, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3630 + }, + { + "epoch": 2.7194620844228616, + "grad_norm": 0.42315489053726196, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3640 + }, + { + "epoch": 2.7269331341053418, + "grad_norm": 0.4971241056919098, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 3650 + }, + { + "epoch": 2.7344041837878224, + "grad_norm": 0.4578486382961273, + "learning_rate": 0.0002, + "loss": 1.6042, + "step": 3660 + }, + { + "epoch": 2.7418752334703025, + "grad_norm": 0.46584776043891907, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 3670 + }, + { + "epoch": 2.749346283152783, + "grad_norm": 0.4951731264591217, + "learning_rate": 0.0002, + "loss": 1.6809, + "step": 3680 + }, + { + "epoch": 2.756817332835263, + "grad_norm": 0.4935225546360016, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 3690 + }, + { + "epoch": 2.764288382517744, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3700 + }, + { + "epoch": 2.7717594322002244, + "grad_norm": 0.4417555630207062, + "learning_rate": 0.0002, + "loss": 1.7173, + "step": 3710 + }, + { + "epoch": 2.7792304818827045, + "grad_norm": 0.48229655623435974, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 3720 + }, + { + "epoch": 2.786701531565185, + "grad_norm": 0.48562315106391907, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3730 + }, + { + "epoch": 2.794172581247665, + "grad_norm": 0.4473940432071686, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 3740 + }, + { + "epoch": 2.801643630930146, + "grad_norm": 0.4626813232898712, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3750 + }, + { + "epoch": 2.809114680612626, + "grad_norm": 0.4339792728424072, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 3760 + }, + { + "epoch": 2.8165857302951065, + "grad_norm": 0.5250858068466187, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 3770 + }, + { + "epoch": 2.824056779977587, + "grad_norm": 0.4537523090839386, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3780 + }, + { + "epoch": 2.831527829660067, + "grad_norm": 0.5646113157272339, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3790 + }, + { + "epoch": 2.8389988793425474, + "grad_norm": 0.44243332743644714, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 3800 + }, + { + "epoch": 2.846469929025028, + "grad_norm": 0.4585791826248169, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3810 + }, + { + "epoch": 2.8539409787075085, + "grad_norm": 0.489702045917511, + "learning_rate": 0.0002, + "loss": 1.6854, + "step": 3820 + }, + { + "epoch": 2.8614120283899886, + "grad_norm": 0.502470850944519, + "learning_rate": 0.0002, + "loss": 1.7066, + "step": 3830 + }, + { + "epoch": 2.8688830780724692, + "grad_norm": 0.4395960867404938, + "learning_rate": 0.0002, + "loss": 1.5785, + "step": 3840 + }, + { + "epoch": 2.87635412775495, + "grad_norm": 0.4348670244216919, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3850 + }, + { + "epoch": 2.88382517743743, + "grad_norm": 0.48852720856666565, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3860 + }, + { + "epoch": 2.89129622711991, + "grad_norm": 0.45317450165748596, + "learning_rate": 0.0002, + "loss": 1.5916, + "step": 3870 + }, + { + "epoch": 2.8987672768023907, + "grad_norm": 0.4732758700847626, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 3880 + }, + { + "epoch": 2.9062383264848712, + "grad_norm": 0.45238012075424194, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 3890 + }, + { + "epoch": 2.9137093761673514, + "grad_norm": 0.48838064074516296, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 3900 + }, + { + "epoch": 2.921180425849832, + "grad_norm": 0.43496349453926086, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 3910 + }, + { + "epoch": 2.9286514755323125, + "grad_norm": 0.47963935136795044, + "learning_rate": 0.0002, + "loss": 1.7063, + "step": 3920 + }, + { + "epoch": 2.9361225252147927, + "grad_norm": 0.4544987976551056, + "learning_rate": 0.0002, + "loss": 1.6553, + "step": 3930 + }, + { + "epoch": 2.943593574897273, + "grad_norm": 0.4622892141342163, + "learning_rate": 0.0002, + "loss": 1.6192, + "step": 3940 + }, + { + "epoch": 2.9510646245797534, + "grad_norm": 0.47026222944259644, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 3950 + }, + { + "epoch": 2.958535674262234, + "grad_norm": 0.4549552798271179, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 3960 + }, + { + "epoch": 2.966006723944714, + "grad_norm": 0.46647515892982483, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3970 + }, + { + "epoch": 2.9734777736271947, + "grad_norm": 0.45095112919807434, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 3980 + }, + { + "epoch": 2.9809488233096753, + "grad_norm": 0.4690017104148865, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 3990 + }, + { + "epoch": 2.9884198729921554, + "grad_norm": 0.4603444039821625, + "learning_rate": 0.0002, + "loss": 1.6061, + "step": 4000 + }, + { + "epoch": 2.9958909226746355, + "grad_norm": 0.4743294417858124, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 4010 + }, + { + "epoch": 2.999626447515876, + "eval_loss": 1.8252571821212769, + "eval_runtime": 38.7853, + "eval_samples_per_second": 13.278, + "eval_steps_per_second": 1.676, + "step": 4015 + }, + { + "epoch": 3.003361972357116, + "grad_norm": 0.4919724464416504, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 4020 + }, + { + "epoch": 3.0108330220395967, + "grad_norm": 0.4747185707092285, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4030 + }, + { + "epoch": 3.018304071722077, + "grad_norm": 0.4797595143318176, + "learning_rate": 0.0002, + "loss": 1.568, + "step": 4040 + }, + { + "epoch": 3.0257751214045574, + "grad_norm": 0.5450999140739441, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 4050 + }, + { + "epoch": 3.0332461710870375, + "grad_norm": 0.49058812856674194, + "learning_rate": 0.0002, + "loss": 1.5065, + "step": 4060 + }, + { + "epoch": 3.040717220769518, + "grad_norm": 0.5219563841819763, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4070 + }, + { + "epoch": 3.0481882704519987, + "grad_norm": 0.515628457069397, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 4080 + }, + { + "epoch": 3.055659320134479, + "grad_norm": 0.6145984530448914, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 4090 + }, + { + "epoch": 3.0631303698169594, + "grad_norm": 0.6067144274711609, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 4100 + }, + { + "epoch": 3.0706014194994395, + "grad_norm": 0.5773133039474487, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 4110 + }, + { + "epoch": 3.07807246918192, + "grad_norm": 0.6894241571426392, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 4120 + }, + { + "epoch": 3.0855435188644003, + "grad_norm": 0.6422514915466309, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4130 + }, + { + "epoch": 3.093014568546881, + "grad_norm": 0.6119855046272278, + "learning_rate": 0.0002, + "loss": 1.4724, + "step": 4140 + }, + { + "epoch": 3.1004856182293614, + "grad_norm": 0.5847280025482178, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 4150 + }, + { + "epoch": 3.1079566679118416, + "grad_norm": 0.5401515960693359, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4160 + }, + { + "epoch": 3.115427717594322, + "grad_norm": 0.6501587629318237, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 4170 + }, + { + "epoch": 3.1228987672768023, + "grad_norm": 0.5988039374351501, + "learning_rate": 0.0002, + "loss": 1.4952, + "step": 4180 + }, + { + "epoch": 3.130369816959283, + "grad_norm": 0.4982665181159973, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 4190 + }, + { + "epoch": 3.137840866641763, + "grad_norm": 0.5548039078712463, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 4200 + }, + { + "epoch": 3.1453119163242436, + "grad_norm": 0.5920777320861816, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 4210 + }, + { + "epoch": 3.152782966006724, + "grad_norm": 0.6965190172195435, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 4220 + }, + { + "epoch": 3.1602540156892043, + "grad_norm": 0.5196244716644287, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 4230 + }, + { + "epoch": 3.167725065371685, + "grad_norm": 0.6942682266235352, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 4240 + }, + { + "epoch": 3.175196115054165, + "grad_norm": 0.5765156149864197, + "learning_rate": 0.0002, + "loss": 1.5407, + "step": 4250 + }, + { + "epoch": 3.1826671647366456, + "grad_norm": 0.5801976919174194, + "learning_rate": 0.0002, + "loss": 1.4963, + "step": 4260 + }, + { + "epoch": 3.1901382144191257, + "grad_norm": 0.6260752081871033, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4270 + }, + { + "epoch": 3.1976092641016063, + "grad_norm": 0.6610770225524902, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 4280 + }, + { + "epoch": 3.205080313784087, + "grad_norm": 0.5762143135070801, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 4290 + }, + { + "epoch": 3.212551363466567, + "grad_norm": 0.5926990509033203, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 4300 + }, + { + "epoch": 3.2200224131490476, + "grad_norm": 0.7373854517936707, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 4310 + }, + { + "epoch": 3.2274934628315277, + "grad_norm": 0.5963311195373535, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 4320 + }, + { + "epoch": 3.2349645125140083, + "grad_norm": 0.5754616856575012, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 4330 + }, + { + "epoch": 3.2424355621964884, + "grad_norm": 0.6116095781326294, + "learning_rate": 0.0002, + "loss": 1.4767, + "step": 4340 + }, + { + "epoch": 3.249906611878969, + "grad_norm": 0.6001536846160889, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 4350 + }, + { + "epoch": 3.257377661561449, + "grad_norm": 0.5270227789878845, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 4360 + }, + { + "epoch": 3.2648487112439297, + "grad_norm": 0.6666602492332458, + "learning_rate": 0.0002, + "loss": 1.5235, + "step": 4370 + }, + { + "epoch": 3.2723197609264103, + "grad_norm": 0.520310640335083, + "learning_rate": 0.0002, + "loss": 1.5665, + "step": 4380 + }, + { + "epoch": 3.2797908106088904, + "grad_norm": 0.5165975093841553, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 4390 + }, + { + "epoch": 3.287261860291371, + "grad_norm": 0.6080228686332703, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4400 + }, + { + "epoch": 3.294732909973851, + "grad_norm": 0.670122504234314, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 4410 + }, + { + "epoch": 3.3022039596563317, + "grad_norm": 0.6019457578659058, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 4420 + }, + { + "epoch": 3.309675009338812, + "grad_norm": 0.5519300103187561, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 4430 + }, + { + "epoch": 3.3171460590212924, + "grad_norm": 0.5958521962165833, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 4440 + }, + { + "epoch": 3.324617108703773, + "grad_norm": 0.5552705526351929, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4450 + }, + { + "epoch": 3.332088158386253, + "grad_norm": 0.6583784818649292, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 4460 + }, + { + "epoch": 3.3395592080687337, + "grad_norm": 0.5815939903259277, + "learning_rate": 0.0002, + "loss": 1.4206, + "step": 4470 + }, + { + "epoch": 3.347030257751214, + "grad_norm": 1.3342205286026, + "learning_rate": 0.0002, + "loss": 1.5942, + "step": 4480 + }, + { + "epoch": 3.3545013074336945, + "grad_norm": 0.6341500878334045, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 4490 + }, + { + "epoch": 3.3619723571161746, + "grad_norm": 0.6384079456329346, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 4500 + }, + { + "epoch": 3.369443406798655, + "grad_norm": 0.6098346710205078, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 4510 + }, + { + "epoch": 3.3769144564811358, + "grad_norm": 0.5958296656608582, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4520 + }, + { + "epoch": 3.384385506163616, + "grad_norm": 0.6157881617546082, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 4530 + }, + { + "epoch": 3.3918565558460965, + "grad_norm": 0.5671007037162781, + "learning_rate": 0.0002, + "loss": 1.569, + "step": 4540 + }, + { + "epoch": 3.3993276055285766, + "grad_norm": 0.6203294992446899, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 4550 + }, + { + "epoch": 3.406798655211057, + "grad_norm": 0.6743317246437073, + "learning_rate": 0.0002, + "loss": 1.5364, + "step": 4560 + }, + { + "epoch": 3.4142697048935373, + "grad_norm": 0.731765627861023, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4570 + }, + { + "epoch": 3.421740754576018, + "grad_norm": 0.6285187602043152, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 4580 + }, + { + "epoch": 3.4292118042584985, + "grad_norm": 0.612680196762085, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 4590 + }, + { + "epoch": 3.4366828539409786, + "grad_norm": 0.6413681507110596, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 4600 + }, + { + "epoch": 3.444153903623459, + "grad_norm": 0.6240990161895752, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 4610 + }, + { + "epoch": 3.4516249533059393, + "grad_norm": 0.5095735192298889, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 4620 + }, + { + "epoch": 3.45909600298842, + "grad_norm": 0.5699611902236938, + "learning_rate": 0.0002, + "loss": 1.4906, + "step": 4630 + }, + { + "epoch": 3.4665670526709, + "grad_norm": 0.7289775609970093, + "learning_rate": 0.0002, + "loss": 1.5176, + "step": 4640 + }, + { + "epoch": 3.4740381023533806, + "grad_norm": 0.6211609840393066, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 4650 + }, + { + "epoch": 3.481509152035861, + "grad_norm": 0.5714802145957947, + "learning_rate": 0.0002, + "loss": 1.533, + "step": 4660 + }, + { + "epoch": 3.4889802017183413, + "grad_norm": 0.6287049651145935, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 4670 + }, + { + "epoch": 3.496451251400822, + "grad_norm": 0.5480595827102661, + "learning_rate": 0.0002, + "loss": 1.4212, + "step": 4680 + }, + { + "epoch": 3.503922301083302, + "grad_norm": 0.5683253407478333, + "learning_rate": 0.0002, + "loss": 1.4746, + "step": 4690 + }, + { + "epoch": 3.5113933507657826, + "grad_norm": 0.601140558719635, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 4700 + }, + { + "epoch": 3.5188644004482628, + "grad_norm": 0.5344498157501221, + "learning_rate": 0.0002, + "loss": 1.5383, + "step": 4710 + }, + { + "epoch": 3.5263354501307433, + "grad_norm": 0.5739690661430359, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4720 + }, + { + "epoch": 3.533806499813224, + "grad_norm": 0.5640085935592651, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 4730 + }, + { + "epoch": 3.541277549495704, + "grad_norm": 0.5967805981636047, + "learning_rate": 0.0002, + "loss": 1.487, + "step": 4740 + }, + { + "epoch": 3.5487485991781846, + "grad_norm": 0.6138835549354553, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4750 + }, + { + "epoch": 3.5562196488606648, + "grad_norm": 0.6779900193214417, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 4760 + }, + { + "epoch": 3.5636906985431454, + "grad_norm": 0.6122010350227356, + "learning_rate": 0.0002, + "loss": 1.4917, + "step": 4770 + }, + { + "epoch": 3.5711617482256255, + "grad_norm": 0.5685241222381592, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4780 + }, + { + "epoch": 3.578632797908106, + "grad_norm": 0.604583203792572, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 4790 + }, + { + "epoch": 3.5861038475905866, + "grad_norm": 0.651165246963501, + "learning_rate": 0.0002, + "loss": 1.4514, + "step": 4800 + }, + { + "epoch": 3.593574897273067, + "grad_norm": 0.6398511528968811, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 4810 + }, + { + "epoch": 3.6010459469555474, + "grad_norm": 0.6444641351699829, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 4820 + }, + { + "epoch": 3.6085169966380275, + "grad_norm": 0.6018481850624084, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 4830 + }, + { + "epoch": 3.615988046320508, + "grad_norm": 0.6025291085243225, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 4840 + }, + { + "epoch": 3.623459096002988, + "grad_norm": 0.6810156106948853, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 4850 + }, + { + "epoch": 3.630930145685469, + "grad_norm": 0.6408044695854187, + "learning_rate": 0.0002, + "loss": 1.5299, + "step": 4860 + }, + { + "epoch": 3.6384011953679494, + "grad_norm": 0.5608272552490234, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4870 + }, + { + "epoch": 3.6458722450504295, + "grad_norm": 0.6136814951896667, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 4880 + }, + { + "epoch": 3.65334329473291, + "grad_norm": 0.5927900075912476, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4890 + }, + { + "epoch": 3.66081434441539, + "grad_norm": 0.5336901545524597, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 4900 + }, + { + "epoch": 3.668285394097871, + "grad_norm": 0.7823320627212524, + "learning_rate": 0.0002, + "loss": 1.5701, + "step": 4910 + }, + { + "epoch": 3.675756443780351, + "grad_norm": 0.6703504323959351, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 4920 + }, + { + "epoch": 3.6832274934628315, + "grad_norm": 0.6061160564422607, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 4930 + }, + { + "epoch": 3.690698543145312, + "grad_norm": 0.6237227916717529, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 4940 + }, + { + "epoch": 3.6981695928277922, + "grad_norm": 0.5985278487205505, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 4950 + }, + { + "epoch": 3.705640642510273, + "grad_norm": 0.6483839750289917, + "learning_rate": 0.0002, + "loss": 1.5132, + "step": 4960 + }, + { + "epoch": 3.713111692192753, + "grad_norm": 0.5788805484771729, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 4970 + }, + { + "epoch": 3.7205827418752335, + "grad_norm": 0.5609974265098572, + "learning_rate": 0.0002, + "loss": 1.5258, + "step": 4980 + }, + { + "epoch": 3.7280537915577137, + "grad_norm": 0.5681300759315491, + "learning_rate": 0.0002, + "loss": 1.4759, + "step": 4990 + }, + { + "epoch": 3.7355248412401942, + "grad_norm": 0.5860186219215393, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 5000 + }, + { + "epoch": 3.742995890922675, + "grad_norm": 0.5718157291412354, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 5010 + }, + { + "epoch": 3.750466940605155, + "grad_norm": 0.6173721551895142, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 5020 + }, + { + "epoch": 3.7579379902876355, + "grad_norm": 0.629152238368988, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 5030 + }, + { + "epoch": 3.7654090399701157, + "grad_norm": 0.5666284561157227, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 5040 + }, + { + "epoch": 3.7728800896525962, + "grad_norm": 0.6053005456924438, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5050 + }, + { + "epoch": 3.7803511393350764, + "grad_norm": 0.5870583057403564, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 5060 + }, + { + "epoch": 3.787822189017557, + "grad_norm": 0.5422009229660034, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5070 + }, + { + "epoch": 3.7952932387000375, + "grad_norm": 0.5396918058395386, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 5080 + }, + { + "epoch": 3.8027642883825177, + "grad_norm": 0.5544713139533997, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 5090 + }, + { + "epoch": 3.8102353380649983, + "grad_norm": 0.5983749628067017, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5100 + }, + { + "epoch": 3.8177063877474784, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 5110 + }, + { + "epoch": 3.825177437429959, + "grad_norm": 0.5436882376670837, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 5120 + }, + { + "epoch": 3.832648487112439, + "grad_norm": 0.5453617572784424, + "learning_rate": 0.0002, + "loss": 1.5118, + "step": 5130 + }, + { + "epoch": 3.8401195367949197, + "grad_norm": 0.6269069314002991, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 5140 + }, + { + "epoch": 3.8475905864774003, + "grad_norm": 0.6189185380935669, + "learning_rate": 0.0002, + "loss": 1.4959, + "step": 5150 + }, + { + "epoch": 3.8550616361598804, + "grad_norm": 0.6653388142585754, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 5160 + }, + { + "epoch": 3.862532685842361, + "grad_norm": 0.5771768689155579, + "learning_rate": 0.0002, + "loss": 1.5075, + "step": 5170 + }, + { + "epoch": 3.870003735524841, + "grad_norm": 0.6052790880203247, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5180 + }, + { + "epoch": 3.8774747852073217, + "grad_norm": 0.6572316884994507, + "learning_rate": 0.0002, + "loss": 1.4987, + "step": 5190 + }, + { + "epoch": 3.884945834889802, + "grad_norm": 0.670576810836792, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 5200 + }, + { + "epoch": 3.8924168845722824, + "grad_norm": 0.5728798508644104, + "learning_rate": 0.0002, + "loss": 1.4777, + "step": 5210 + }, + { + "epoch": 3.899887934254763, + "grad_norm": 0.6340774297714233, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 5220 + }, + { + "epoch": 3.907358983937243, + "grad_norm": 0.5981315970420837, + "learning_rate": 0.0002, + "loss": 1.5081, + "step": 5230 + }, + { + "epoch": 3.9148300336197237, + "grad_norm": 0.6212025880813599, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 5240 + }, + { + "epoch": 3.922301083302204, + "grad_norm": 0.6202296018600464, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5250 + }, + { + "epoch": 3.9297721329846844, + "grad_norm": 0.6159142255783081, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 5260 + }, + { + "epoch": 3.9372431826671646, + "grad_norm": 0.6519438624382019, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 5270 + }, + { + "epoch": 3.944714232349645, + "grad_norm": 0.539813756942749, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5280 + }, + { + "epoch": 3.9521852820321257, + "grad_norm": 0.6443665027618408, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 5290 + }, + { + "epoch": 3.959656331714606, + "grad_norm": 0.6635757684707642, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 5300 + }, + { + "epoch": 3.9671273813970864, + "grad_norm": 0.589363157749176, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 5310 + }, + { + "epoch": 3.9745984310795666, + "grad_norm": 0.5788735747337341, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 5320 + }, + { + "epoch": 3.982069480762047, + "grad_norm": 0.5976864695549011, + "learning_rate": 0.0002, + "loss": 1.5607, + "step": 5330 + }, + { + "epoch": 3.9895405304445273, + "grad_norm": 0.6624067425727844, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 5340 + }, + { + "epoch": 3.997011580127008, + "grad_norm": 0.6738956570625305, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 5350 + }, + { + "epoch": 4.0, + "eval_loss": 1.868006944656372, + "eval_runtime": 38.5153, + "eval_samples_per_second": 13.371, + "eval_steps_per_second": 1.688, + "step": 5354 + }, + { + "epoch": 4.004482629809488, + "grad_norm": 0.6023468971252441, + "learning_rate": 0.0002, + "loss": 1.4535, + "step": 5360 + }, + { + "epoch": 4.011953679491969, + "grad_norm": 0.8589285612106323, + "learning_rate": 0.0002, + "loss": 1.3987, + "step": 5370 + }, + { + "epoch": 4.019424729174449, + "grad_norm": 0.7477491497993469, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5380 + }, + { + "epoch": 4.02689577885693, + "grad_norm": 0.7601922154426575, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 5390 + }, + { + "epoch": 4.03436682853941, + "grad_norm": 0.8115614056587219, + "learning_rate": 0.0002, + "loss": 1.4133, + "step": 5400 + }, + { + "epoch": 4.04183787822189, + "grad_norm": 0.669925332069397, + "learning_rate": 0.0002, + "loss": 1.3748, + "step": 5410 + }, + { + "epoch": 4.04930892790437, + "grad_norm": 0.8091904520988464, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 5420 + }, + { + "epoch": 4.056779977586851, + "grad_norm": 0.709405779838562, + "learning_rate": 0.0002, + "loss": 1.3615, + "step": 5430 + }, + { + "epoch": 4.064251027269331, + "grad_norm": 1.0006179809570312, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 5440 + }, + { + "epoch": 4.071722076951811, + "grad_norm": 0.7017965912818909, + "learning_rate": 0.0002, + "loss": 1.3491, + "step": 5450 + }, + { + "epoch": 4.0791931266342925, + "grad_norm": 0.8991572260856628, + "learning_rate": 0.0002, + "loss": 1.3642, + "step": 5460 + }, + { + "epoch": 4.086664176316773, + "grad_norm": 0.9064797759056091, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5470 + }, + { + "epoch": 4.094135225999253, + "grad_norm": 0.7981749176979065, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5480 + }, + { + "epoch": 4.101606275681733, + "grad_norm": 0.7280883193016052, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 5490 + }, + { + "epoch": 4.109077325364214, + "grad_norm": 0.7419600486755371, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 5500 + }, + { + "epoch": 4.116548375046694, + "grad_norm": 0.8019949197769165, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 5510 + }, + { + "epoch": 4.124019424729174, + "grad_norm": 0.7501229047775269, + "learning_rate": 0.0002, + "loss": 1.3133, + "step": 5520 + }, + { + "epoch": 4.131490474411655, + "grad_norm": 0.8166249990463257, + "learning_rate": 0.0002, + "loss": 1.4432, + "step": 5530 + }, + { + "epoch": 4.138961524094135, + "grad_norm": 0.9728496074676514, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5540 + }, + { + "epoch": 4.1464325737766154, + "grad_norm": 0.7590922117233276, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 5550 + }, + { + "epoch": 4.153903623459096, + "grad_norm": 0.7759010791778564, + "learning_rate": 0.0002, + "loss": 1.4368, + "step": 5560 + }, + { + "epoch": 4.161374673141577, + "grad_norm": 0.9057986736297607, + "learning_rate": 0.0002, + "loss": 1.3635, + "step": 5570 + }, + { + "epoch": 4.168845722824057, + "grad_norm": 0.8853937983512878, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 5580 + }, + { + "epoch": 4.176316772506537, + "grad_norm": 0.7070684432983398, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5590 + }, + { + "epoch": 4.183787822189018, + "grad_norm": 0.7649410963058472, + "learning_rate": 0.0002, + "loss": 1.3218, + "step": 5600 + }, + { + "epoch": 4.191258871871498, + "grad_norm": 1.2048029899597168, + "learning_rate": 0.0002, + "loss": 1.3857, + "step": 5610 + }, + { + "epoch": 4.198729921553978, + "grad_norm": 0.7986605763435364, + "learning_rate": 0.0002, + "loss": 1.3629, + "step": 5620 + }, + { + "epoch": 4.206200971236458, + "grad_norm": 0.8151885867118835, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 5630 + }, + { + "epoch": 4.213672020918939, + "grad_norm": 0.7719064354896545, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5640 + }, + { + "epoch": 4.2211430706014195, + "grad_norm": 0.8422448039054871, + "learning_rate": 0.0002, + "loss": 1.3852, + "step": 5650 + }, + { + "epoch": 4.2286141202839, + "grad_norm": 0.7017164826393127, + "learning_rate": 0.0002, + "loss": 1.3321, + "step": 5660 + }, + { + "epoch": 4.236085169966381, + "grad_norm": 0.8559677600860596, + "learning_rate": 0.0002, + "loss": 1.4105, + "step": 5670 + }, + { + "epoch": 4.243556219648861, + "grad_norm": 0.8216157555580139, + "learning_rate": 0.0002, + "loss": 1.3701, + "step": 5680 + }, + { + "epoch": 4.251027269331341, + "grad_norm": 0.7681755423545837, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5690 + }, + { + "epoch": 4.258498319013821, + "grad_norm": 0.811665952205658, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 5700 + }, + { + "epoch": 4.265969368696302, + "grad_norm": 0.7242204546928406, + "learning_rate": 0.0002, + "loss": 1.4161, + "step": 5710 + }, + { + "epoch": 4.273440418378782, + "grad_norm": 0.7570181488990784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 5720 + }, + { + "epoch": 4.280911468061262, + "grad_norm": 0.8951969146728516, + "learning_rate": 0.0002, + "loss": 1.4265, + "step": 5730 + }, + { + "epoch": 4.288382517743743, + "grad_norm": 0.7222902178764343, + "learning_rate": 0.0002, + "loss": 1.3895, + "step": 5740 + }, + { + "epoch": 4.2958535674262235, + "grad_norm": 0.8508469462394714, + "learning_rate": 0.0002, + "loss": 1.4155, + "step": 5750 + }, + { + "epoch": 4.303324617108704, + "grad_norm": 0.7215430736541748, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 5760 + }, + { + "epoch": 4.310795666791184, + "grad_norm": 0.8774884939193726, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5770 + }, + { + "epoch": 4.318266716473665, + "grad_norm": 0.8354552984237671, + "learning_rate": 0.0002, + "loss": 1.427, + "step": 5780 + }, + { + "epoch": 4.325737766156145, + "grad_norm": 0.6938814520835876, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 5790 + }, + { + "epoch": 4.333208815838625, + "grad_norm": 0.78675377368927, + "learning_rate": 0.0002, + "loss": 1.3589, + "step": 5800 + }, + { + "epoch": 4.340679865521106, + "grad_norm": 0.7147697806358337, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 5810 + }, + { + "epoch": 4.348150915203586, + "grad_norm": 0.7693623304367065, + "learning_rate": 0.0002, + "loss": 1.3597, + "step": 5820 + }, + { + "epoch": 4.355621964886066, + "grad_norm": 0.856517493724823, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 5830 + }, + { + "epoch": 4.3630930145685465, + "grad_norm": 0.7200973033905029, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 5840 + }, + { + "epoch": 4.3705640642510275, + "grad_norm": 0.743281364440918, + "learning_rate": 0.0002, + "loss": 1.442, + "step": 5850 + }, + { + "epoch": 4.378035113933508, + "grad_norm": 0.7627727389335632, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5860 + }, + { + "epoch": 4.385506163615988, + "grad_norm": 0.7238836884498596, + "learning_rate": 0.0002, + "loss": 1.4082, + "step": 5870 + }, + { + "epoch": 4.392977213298469, + "grad_norm": 0.7253410816192627, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 5880 + }, + { + "epoch": 4.400448262980949, + "grad_norm": 0.8232238292694092, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 5890 + }, + { + "epoch": 4.407919312663429, + "grad_norm": 0.8778504729270935, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5900 + }, + { + "epoch": 4.415390362345909, + "grad_norm": 0.7639474868774414, + "learning_rate": 0.0002, + "loss": 1.387, + "step": 5910 + }, + { + "epoch": 4.42286141202839, + "grad_norm": 0.7666519284248352, + "learning_rate": 0.0002, + "loss": 1.3862, + "step": 5920 + }, + { + "epoch": 4.43033246171087, + "grad_norm": 0.867132842540741, + "learning_rate": 0.0002, + "loss": 1.4168, + "step": 5930 + }, + { + "epoch": 4.4378035113933505, + "grad_norm": 0.7571166753768921, + "learning_rate": 0.0002, + "loss": 1.4772, + "step": 5940 + }, + { + "epoch": 4.4452745610758315, + "grad_norm": 0.7911370992660522, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 5950 + }, + { + "epoch": 4.452745610758312, + "grad_norm": 0.8844250440597534, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 5960 + }, + { + "epoch": 4.460216660440792, + "grad_norm": 0.7336231470108032, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 5970 + }, + { + "epoch": 4.467687710123272, + "grad_norm": 0.8162738084793091, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 5980 + }, + { + "epoch": 4.475158759805753, + "grad_norm": 0.7413017153739929, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 5990 + }, + { + "epoch": 4.482629809488233, + "grad_norm": 0.7215432524681091, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 6000 + }, + { + "epoch": 4.490100859170713, + "grad_norm": 0.8943389058113098, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 6010 + }, + { + "epoch": 4.497571908853194, + "grad_norm": 0.7850823998451233, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 6020 + }, + { + "epoch": 4.505042958535674, + "grad_norm": 0.8117504119873047, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 6030 + }, + { + "epoch": 4.5125140082181545, + "grad_norm": 0.8381605744361877, + "learning_rate": 0.0002, + "loss": 1.4272, + "step": 6040 + }, + { + "epoch": 4.519985057900635, + "grad_norm": 0.7964059710502625, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 6050 + }, + { + "epoch": 4.527456107583116, + "grad_norm": 0.7935128211975098, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 6060 + }, + { + "epoch": 4.534927157265596, + "grad_norm": 0.8725124597549438, + "learning_rate": 0.0002, + "loss": 1.3994, + "step": 6070 + }, + { + "epoch": 4.542398206948076, + "grad_norm": 0.880325198173523, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6080 + }, + { + "epoch": 4.549869256630557, + "grad_norm": 0.7220637202262878, + "learning_rate": 0.0002, + "loss": 1.4459, + "step": 6090 + }, + { + "epoch": 4.557340306313037, + "grad_norm": 0.6908547878265381, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 6100 + }, + { + "epoch": 4.564811355995517, + "grad_norm": 0.797931969165802, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 6110 + }, + { + "epoch": 4.572282405677997, + "grad_norm": 0.7056134343147278, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 6120 + }, + { + "epoch": 4.579753455360478, + "grad_norm": 0.7850478887557983, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 6130 + }, + { + "epoch": 4.5872245050429585, + "grad_norm": 0.8112621307373047, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6140 + }, + { + "epoch": 4.594695554725439, + "grad_norm": 0.7040849328041077, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 6150 + }, + { + "epoch": 4.60216660440792, + "grad_norm": 0.7214553952217102, + "learning_rate": 0.0002, + "loss": 1.3526, + "step": 6160 + }, + { + "epoch": 4.6096376540904, + "grad_norm": 0.8616511821746826, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6170 + }, + { + "epoch": 4.61710870377288, + "grad_norm": 0.8374658226966858, + "learning_rate": 0.0002, + "loss": 1.4622, + "step": 6180 + }, + { + "epoch": 4.62457975345536, + "grad_norm": 0.6761606931686401, + "learning_rate": 0.0002, + "loss": 1.3703, + "step": 6190 + }, + { + "epoch": 4.632050803137841, + "grad_norm": 0.768028199672699, + "learning_rate": 0.0002, + "loss": 1.3977, + "step": 6200 + }, + { + "epoch": 4.639521852820321, + "grad_norm": 0.9372717142105103, + "learning_rate": 0.0002, + "loss": 1.3772, + "step": 6210 + }, + { + "epoch": 4.646992902502801, + "grad_norm": 0.7906546592712402, + "learning_rate": 0.0002, + "loss": 1.4098, + "step": 6220 + }, + { + "epoch": 4.654463952185282, + "grad_norm": 0.7376723289489746, + "learning_rate": 0.0002, + "loss": 1.3962, + "step": 6230 + }, + { + "epoch": 4.6619350018677626, + "grad_norm": 0.8972630500793457, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 6240 + }, + { + "epoch": 4.669406051550243, + "grad_norm": 0.8261756300926208, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 6250 + }, + { + "epoch": 4.676877101232723, + "grad_norm": 0.7512393593788147, + "learning_rate": 0.0002, + "loss": 1.3267, + "step": 6260 + }, + { + "epoch": 4.684348150915204, + "grad_norm": 0.7132362127304077, + "learning_rate": 0.0002, + "loss": 1.4278, + "step": 6270 + }, + { + "epoch": 4.691819200597684, + "grad_norm": 0.7690575122833252, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6280 + }, + { + "epoch": 4.699290250280164, + "grad_norm": 0.9886258840560913, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 6290 + }, + { + "epoch": 4.706761299962645, + "grad_norm": 0.9502435922622681, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6300 + }, + { + "epoch": 4.714232349645125, + "grad_norm": 0.702255129814148, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6310 + }, + { + "epoch": 4.721703399327605, + "grad_norm": 0.7713103890419006, + "learning_rate": 0.0002, + "loss": 1.4447, + "step": 6320 + }, + { + "epoch": 4.7291744490100855, + "grad_norm": 0.7778580784797668, + "learning_rate": 0.0002, + "loss": 1.4392, + "step": 6330 + }, + { + "epoch": 4.736645498692567, + "grad_norm": 0.7275111079216003, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 6340 + }, + { + "epoch": 4.744116548375047, + "grad_norm": 0.7728744149208069, + "learning_rate": 0.0002, + "loss": 1.4429, + "step": 6350 + }, + { + "epoch": 4.751587598057527, + "grad_norm": 0.9724260568618774, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 6360 + }, + { + "epoch": 4.759058647740007, + "grad_norm": 0.7505622506141663, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 6370 + }, + { + "epoch": 4.766529697422488, + "grad_norm": 0.7994682788848877, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 6380 + }, + { + "epoch": 4.774000747104968, + "grad_norm": 0.8432038426399231, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 6390 + }, + { + "epoch": 4.781471796787448, + "grad_norm": 0.7436022758483887, + "learning_rate": 0.0002, + "loss": 1.4606, + "step": 6400 + }, + { + "epoch": 4.788942846469929, + "grad_norm": 0.7709194421768188, + "learning_rate": 0.0002, + "loss": 1.3461, + "step": 6410 + }, + { + "epoch": 4.796413896152409, + "grad_norm": 0.8798436522483826, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6420 + }, + { + "epoch": 4.80388494583489, + "grad_norm": 0.790189266204834, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 6430 + }, + { + "epoch": 4.811355995517371, + "grad_norm": 0.6824303865432739, + "learning_rate": 0.0002, + "loss": 1.4109, + "step": 6440 + }, + { + "epoch": 4.818827045199851, + "grad_norm": 0.7501044869422913, + "learning_rate": 0.0002, + "loss": 1.3877, + "step": 6450 + }, + { + "epoch": 4.826298094882331, + "grad_norm": 0.8840398192405701, + "learning_rate": 0.0002, + "loss": 1.4458, + "step": 6460 + }, + { + "epoch": 4.833769144564811, + "grad_norm": 0.7812688946723938, + "learning_rate": 0.0002, + "loss": 1.4412, + "step": 6470 + }, + { + "epoch": 4.841240194247292, + "grad_norm": 0.7429926991462708, + "learning_rate": 0.0002, + "loss": 1.4299, + "step": 6480 + }, + { + "epoch": 4.848711243929772, + "grad_norm": 0.7778021693229675, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 6490 + }, + { + "epoch": 4.856182293612252, + "grad_norm": 0.8270702362060547, + "learning_rate": 0.0002, + "loss": 1.4589, + "step": 6500 + }, + { + "epoch": 4.863653343294732, + "grad_norm": 0.6960513591766357, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6510 + }, + { + "epoch": 4.8711243929772134, + "grad_norm": 0.7728942632675171, + "learning_rate": 0.0002, + "loss": 1.376, + "step": 6520 + }, + { + "epoch": 4.878595442659694, + "grad_norm": 0.7377303838729858, + "learning_rate": 0.0002, + "loss": 1.4852, + "step": 6530 + }, + { + "epoch": 4.886066492342174, + "grad_norm": 0.7257253527641296, + "learning_rate": 0.0002, + "loss": 1.3846, + "step": 6540 + }, + { + "epoch": 4.893537542024655, + "grad_norm": 0.7875821590423584, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 6550 + }, + { + "epoch": 4.901008591707135, + "grad_norm": 0.8346304297447205, + "learning_rate": 0.0002, + "loss": 1.357, + "step": 6560 + }, + { + "epoch": 4.908479641389615, + "grad_norm": 0.7710739374160767, + "learning_rate": 0.0002, + "loss": 1.4522, + "step": 6570 + }, + { + "epoch": 4.915950691072096, + "grad_norm": 0.7015138268470764, + "learning_rate": 0.0002, + "loss": 1.4465, + "step": 6580 + }, + { + "epoch": 4.923421740754576, + "grad_norm": 0.8707432150840759, + "learning_rate": 0.0002, + "loss": 1.435, + "step": 6590 + }, + { + "epoch": 4.930892790437056, + "grad_norm": 0.786601185798645, + "learning_rate": 0.0002, + "loss": 1.2968, + "step": 6600 + }, + { + "epoch": 4.938363840119536, + "grad_norm": 0.978519082069397, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 6610 + }, + { + "epoch": 4.9458348898020175, + "grad_norm": 0.8102927207946777, + "learning_rate": 0.0002, + "loss": 1.3997, + "step": 6620 + }, + { + "epoch": 4.953305939484498, + "grad_norm": 0.7628704309463501, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 6630 + }, + { + "epoch": 4.960776989166978, + "grad_norm": 0.8053455352783203, + "learning_rate": 0.0002, + "loss": 1.3774, + "step": 6640 + }, + { + "epoch": 4.968248038849458, + "grad_norm": 0.8680412173271179, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 6650 + }, + { + "epoch": 4.975719088531939, + "grad_norm": 0.7415758371353149, + "learning_rate": 0.0002, + "loss": 1.3978, + "step": 6660 + }, + { + "epoch": 4.983190138214419, + "grad_norm": 0.7730312347412109, + "learning_rate": 0.0002, + "loss": 1.3793, + "step": 6670 + }, + { + "epoch": 4.990661187896899, + "grad_norm": 0.7924041152000427, + "learning_rate": 0.0002, + "loss": 1.4863, + "step": 6680 + }, + { + "epoch": 4.99813223757938, + "grad_norm": 0.8677893877029419, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 6690 + }, + { + "epoch": 4.999626447515876, + "eval_loss": 1.9444633722305298, + "eval_runtime": 39.3488, + "eval_samples_per_second": 13.088, + "eval_steps_per_second": 1.652, + "step": 6692 + }, + { + "epoch": 5.00560328726186, + "grad_norm": 0.7102245092391968, + "learning_rate": 0.0002, + "loss": 1.3076, + "step": 6700 + }, + { + "epoch": 5.0130743369443405, + "grad_norm": 1.0425463914871216, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 6710 + }, + { + "epoch": 5.0205453866268215, + "grad_norm": 0.9320756793022156, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 6720 + }, + { + "epoch": 5.028016436309302, + "grad_norm": 0.8797217607498169, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 6730 + }, + { + "epoch": 5.035487485991782, + "grad_norm": 2.135707139968872, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 6740 + }, + { + "epoch": 5.042958535674262, + "grad_norm": 0.8747734427452087, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 6750 + }, + { + "epoch": 5.050429585356743, + "grad_norm": 0.9981076717376709, + "learning_rate": 0.0002, + "loss": 1.1675, + "step": 6760 + }, + { + "epoch": 5.057900635039223, + "grad_norm": 0.985078752040863, + "learning_rate": 0.0002, + "loss": 1.1976, + "step": 6770 + }, + { + "epoch": 5.065371684721703, + "grad_norm": 1.0974019765853882, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 6780 + }, + { + "epoch": 5.072842734404184, + "grad_norm": 0.9823219180107117, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6790 + }, + { + "epoch": 5.080313784086664, + "grad_norm": 1.122605562210083, + "learning_rate": 0.0002, + "loss": 1.2586, + "step": 6800 + }, + { + "epoch": 5.0877848337691445, + "grad_norm": 0.8556802272796631, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6810 + }, + { + "epoch": 5.095255883451625, + "grad_norm": 1.1699262857437134, + "learning_rate": 0.0002, + "loss": 1.1908, + "step": 6820 + }, + { + "epoch": 5.102726933134106, + "grad_norm": 1.0440590381622314, + "learning_rate": 0.0002, + "loss": 1.1869, + "step": 6830 + }, + { + "epoch": 5.110197982816586, + "grad_norm": 1.0445300340652466, + "learning_rate": 0.0002, + "loss": 1.1655, + "step": 6840 + }, + { + "epoch": 5.117669032499066, + "grad_norm": 0.8289563059806824, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 6850 + }, + { + "epoch": 5.125140082181547, + "grad_norm": 1.1051193475723267, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 6860 + }, + { + "epoch": 5.132611131864027, + "grad_norm": 0.9345614910125732, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 6870 + }, + { + "epoch": 5.140082181546507, + "grad_norm": 1.1222996711730957, + "learning_rate": 0.0002, + "loss": 1.3021, + "step": 6880 + }, + { + "epoch": 5.147553231228987, + "grad_norm": 0.9405338764190674, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 6890 + }, + { + "epoch": 5.155024280911468, + "grad_norm": 1.0935171842575073, + "learning_rate": 0.0002, + "loss": 1.2367, + "step": 6900 + }, + { + "epoch": 5.1624953305939485, + "grad_norm": 1.0438612699508667, + "learning_rate": 0.0002, + "loss": 1.2458, + "step": 6910 + }, + { + "epoch": 5.169966380276429, + "grad_norm": 1.1189004182815552, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6920 + }, + { + "epoch": 5.17743742995891, + "grad_norm": 1.0533215999603271, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 6930 + }, + { + "epoch": 5.18490847964139, + "grad_norm": 0.9779648780822754, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 6940 + }, + { + "epoch": 5.19237952932387, + "grad_norm": 0.8920868635177612, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 6950 + }, + { + "epoch": 5.19985057900635, + "grad_norm": 0.8374548554420471, + "learning_rate": 0.0002, + "loss": 1.283, + "step": 6960 + }, + { + "epoch": 5.207321628688831, + "grad_norm": 1.0490682125091553, + "learning_rate": 0.0002, + "loss": 1.2775, + "step": 6970 + }, + { + "epoch": 5.214792678371311, + "grad_norm": 0.9658287167549133, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 6980 + }, + { + "epoch": 5.222263728053791, + "grad_norm": 0.9652056097984314, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6990 + }, + { + "epoch": 5.229734777736272, + "grad_norm": 0.9141794443130493, + "learning_rate": 0.0002, + "loss": 1.3023, + "step": 7000 + }, + { + "epoch": 5.2372058274187525, + "grad_norm": 0.9831376671791077, + "learning_rate": 0.0002, + "loss": 1.2456, + "step": 7010 + }, + { + "epoch": 5.244676877101233, + "grad_norm": 1.0198718309402466, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 7020 + }, + { + "epoch": 5.252147926783713, + "grad_norm": 0.9647888541221619, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 7030 + }, + { + "epoch": 5.259618976466194, + "grad_norm": 1.3941649198532104, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 7040 + }, + { + "epoch": 5.267090026148674, + "grad_norm": 1.0305466651916504, + "learning_rate": 0.0002, + "loss": 1.2885, + "step": 7050 + }, + { + "epoch": 5.274561075831154, + "grad_norm": 0.9577859044075012, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 7060 + }, + { + "epoch": 5.282032125513634, + "grad_norm": 1.149092197418213, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 7070 + }, + { + "epoch": 5.289503175196115, + "grad_norm": 1.2582733631134033, + "learning_rate": 0.0002, + "loss": 1.2986, + "step": 7080 + }, + { + "epoch": 5.296974224878595, + "grad_norm": 1.1777442693710327, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 7090 + }, + { + "epoch": 5.3044452745610755, + "grad_norm": 1.0076404809951782, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 7100 + }, + { + "epoch": 5.3119163242435565, + "grad_norm": 0.9037365913391113, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 7110 + }, + { + "epoch": 5.319387373926037, + "grad_norm": 0.9428724646568298, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 7120 + }, + { + "epoch": 5.326858423608517, + "grad_norm": 0.9935154318809509, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 7130 + }, + { + "epoch": 5.334329473290998, + "grad_norm": 1.087500810623169, + "learning_rate": 0.0002, + "loss": 1.2833, + "step": 7140 + }, + { + "epoch": 5.341800522973478, + "grad_norm": 0.8543072938919067, + "learning_rate": 0.0002, + "loss": 1.2304, + "step": 7150 + }, + { + "epoch": 5.349271572655958, + "grad_norm": 0.9323700070381165, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 7160 + }, + { + "epoch": 5.356742622338438, + "grad_norm": 1.0037827491760254, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 7170 + }, + { + "epoch": 5.364213672020919, + "grad_norm": 0.8746469616889954, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 7180 + }, + { + "epoch": 5.371684721703399, + "grad_norm": 0.9516328573226929, + "learning_rate": 0.0002, + "loss": 1.2759, + "step": 7190 + }, + { + "epoch": 5.3791557713858795, + "grad_norm": 0.9395177364349365, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 7200 + }, + { + "epoch": 5.38662682106836, + "grad_norm": 1.000369906425476, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 7210 + }, + { + "epoch": 5.394097870750841, + "grad_norm": 1.0845502614974976, + "learning_rate": 0.0002, + "loss": 1.2337, + "step": 7220 + }, + { + "epoch": 5.401568920433321, + "grad_norm": 0.8975145220756531, + "learning_rate": 0.0002, + "loss": 1.2776, + "step": 7230 + }, + { + "epoch": 5.409039970115801, + "grad_norm": 1.040077805519104, + "learning_rate": 0.0002, + "loss": 1.2306, + "step": 7240 + }, + { + "epoch": 5.416511019798282, + "grad_norm": 1.0729942321777344, + "learning_rate": 0.0002, + "loss": 1.2277, + "step": 7250 + }, + { + "epoch": 5.423982069480762, + "grad_norm": 0.8322232961654663, + "learning_rate": 0.0002, + "loss": 1.2714, + "step": 7260 + }, + { + "epoch": 5.431453119163242, + "grad_norm": 1.0654641389846802, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 7270 + }, + { + "epoch": 5.438924168845723, + "grad_norm": 1.0445852279663086, + "learning_rate": 0.0002, + "loss": 1.268, + "step": 7280 + }, + { + "epoch": 5.446395218528203, + "grad_norm": 1.0762956142425537, + "learning_rate": 0.0002, + "loss": 1.2743, + "step": 7290 + }, + { + "epoch": 5.4538662682106835, + "grad_norm": 0.9721953868865967, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7300 + }, + { + "epoch": 5.461337317893164, + "grad_norm": 0.9238539338111877, + "learning_rate": 0.0002, + "loss": 1.2833, + "step": 7310 + }, + { + "epoch": 5.468808367575645, + "grad_norm": 0.9912874102592468, + "learning_rate": 0.0002, + "loss": 1.255, + "step": 7320 + }, + { + "epoch": 5.476279417258125, + "grad_norm": 1.0727077722549438, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 7330 + }, + { + "epoch": 5.483750466940605, + "grad_norm": 0.8633865118026733, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 7340 + }, + { + "epoch": 5.491221516623085, + "grad_norm": 0.9396262764930725, + "learning_rate": 0.0002, + "loss": 1.3155, + "step": 7350 + }, + { + "epoch": 5.498692566305566, + "grad_norm": 1.0253715515136719, + "learning_rate": 0.0002, + "loss": 1.3146, + "step": 7360 + }, + { + "epoch": 5.506163615988046, + "grad_norm": 1.006047010421753, + "learning_rate": 0.0002, + "loss": 1.3156, + "step": 7370 + }, + { + "epoch": 5.513634665670526, + "grad_norm": 0.9781233072280884, + "learning_rate": 0.0002, + "loss": 1.3107, + "step": 7380 + }, + { + "epoch": 5.521105715353007, + "grad_norm": 0.9945126175880432, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 7390 + }, + { + "epoch": 5.528576765035488, + "grad_norm": 0.9081175327301025, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 7400 + }, + { + "epoch": 5.536047814717968, + "grad_norm": 1.2215938568115234, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 7410 + }, + { + "epoch": 5.543518864400449, + "grad_norm": 1.0724077224731445, + "learning_rate": 0.0002, + "loss": 1.2484, + "step": 7420 + }, + { + "epoch": 5.550989914082929, + "grad_norm": 1.106955885887146, + "learning_rate": 0.0002, + "loss": 1.3083, + "step": 7430 + }, + { + "epoch": 5.558460963765409, + "grad_norm": 1.0657650232315063, + "learning_rate": 0.0002, + "loss": 1.2125, + "step": 7440 + }, + { + "epoch": 5.565932013447889, + "grad_norm": 0.9725455641746521, + "learning_rate": 0.0002, + "loss": 1.2576, + "step": 7450 + }, + { + "epoch": 5.57340306313037, + "grad_norm": 0.8604224324226379, + "learning_rate": 0.0002, + "loss": 1.3297, + "step": 7460 + }, + { + "epoch": 5.58087411281285, + "grad_norm": 0.9913371205329895, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 7470 + }, + { + "epoch": 5.58834516249533, + "grad_norm": 1.012073040008545, + "learning_rate": 0.0002, + "loss": 1.3371, + "step": 7480 + }, + { + "epoch": 5.5958162121778106, + "grad_norm": 1.1003159284591675, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 7490 + }, + { + "epoch": 5.603287261860292, + "grad_norm": 0.9104593992233276, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7500 + }, + { + "epoch": 5.610758311542772, + "grad_norm": 0.9480831623077393, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 7510 + }, + { + "epoch": 5.618229361225252, + "grad_norm": 1.0826456546783447, + "learning_rate": 0.0002, + "loss": 1.3056, + "step": 7520 + }, + { + "epoch": 5.625700410907733, + "grad_norm": 0.8286259174346924, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 7530 + }, + { + "epoch": 5.633171460590213, + "grad_norm": 0.9145061373710632, + "learning_rate": 0.0002, + "loss": 1.2918, + "step": 7540 + }, + { + "epoch": 5.640642510272693, + "grad_norm": 0.9363601803779602, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 7550 + }, + { + "epoch": 5.648113559955174, + "grad_norm": 0.9553244709968567, + "learning_rate": 0.0002, + "loss": 1.2265, + "step": 7560 + }, + { + "epoch": 5.655584609637654, + "grad_norm": 1.0343557596206665, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 7570 + }, + { + "epoch": 5.663055659320134, + "grad_norm": 0.8734238743782043, + "learning_rate": 0.0002, + "loss": 1.3171, + "step": 7580 + }, + { + "epoch": 5.670526709002615, + "grad_norm": 1.0230586528778076, + "learning_rate": 0.0002, + "loss": 1.2785, + "step": 7590 + }, + { + "epoch": 5.677997758685096, + "grad_norm": 1.0063409805297852, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 7600 + }, + { + "epoch": 5.685468808367576, + "grad_norm": 1.0104626417160034, + "learning_rate": 0.0002, + "loss": 1.2396, + "step": 7610 + }, + { + "epoch": 5.692939858050056, + "grad_norm": 0.9528168439865112, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 7620 + }, + { + "epoch": 5.700410907732536, + "grad_norm": 0.9799878597259521, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 7630 + }, + { + "epoch": 5.707881957415017, + "grad_norm": 0.969351589679718, + "learning_rate": 0.0002, + "loss": 1.2632, + "step": 7640 + }, + { + "epoch": 5.715353007097497, + "grad_norm": 1.3037652969360352, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 7650 + }, + { + "epoch": 5.722824056779977, + "grad_norm": 1.0640486478805542, + "learning_rate": 0.0002, + "loss": 1.3126, + "step": 7660 + }, + { + "epoch": 5.730295106462458, + "grad_norm": 1.0416420698165894, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 7670 + }, + { + "epoch": 5.7377661561449385, + "grad_norm": 0.8893619775772095, + "learning_rate": 0.0002, + "loss": 1.25, + "step": 7680 + }, + { + "epoch": 5.745237205827419, + "grad_norm": 0.8512844443321228, + "learning_rate": 0.0002, + "loss": 1.319, + "step": 7690 + }, + { + "epoch": 5.7527082555099, + "grad_norm": 0.9955748319625854, + "learning_rate": 0.0002, + "loss": 1.3328, + "step": 7700 + }, + { + "epoch": 5.76017930519238, + "grad_norm": 1.0409910678863525, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7710 + }, + { + "epoch": 5.76765035487486, + "grad_norm": 1.010097861289978, + "learning_rate": 0.0002, + "loss": 1.3518, + "step": 7720 + }, + { + "epoch": 5.77512140455734, + "grad_norm": 0.8974892497062683, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 7730 + }, + { + "epoch": 5.782592454239821, + "grad_norm": 0.972835123538971, + "learning_rate": 0.0002, + "loss": 1.2743, + "step": 7740 + }, + { + "epoch": 5.790063503922301, + "grad_norm": 0.9607440829277039, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 7750 + }, + { + "epoch": 5.797534553604781, + "grad_norm": 0.9426500797271729, + "learning_rate": 0.0002, + "loss": 1.29, + "step": 7760 + }, + { + "epoch": 5.8050056032872615, + "grad_norm": 0.8745320439338684, + "learning_rate": 0.0002, + "loss": 1.274, + "step": 7770 + }, + { + "epoch": 5.8124766529697425, + "grad_norm": 1.0117204189300537, + "learning_rate": 0.0002, + "loss": 1.3009, + "step": 7780 + }, + { + "epoch": 5.819947702652223, + "grad_norm": 1.0387755632400513, + "learning_rate": 0.0002, + "loss": 1.3135, + "step": 7790 + }, + { + "epoch": 5.827418752334703, + "grad_norm": 1.0709784030914307, + "learning_rate": 0.0002, + "loss": 1.2709, + "step": 7800 + }, + { + "epoch": 5.834889802017184, + "grad_norm": 0.9512667655944824, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 7810 + }, + { + "epoch": 5.842360851699664, + "grad_norm": 1.021094560623169, + "learning_rate": 0.0002, + "loss": 1.3284, + "step": 7820 + }, + { + "epoch": 5.849831901382144, + "grad_norm": 1.117491364479065, + "learning_rate": 0.0002, + "loss": 1.2794, + "step": 7830 + }, + { + "epoch": 5.857302951064625, + "grad_norm": 0.9252554178237915, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 7840 + }, + { + "epoch": 5.864774000747105, + "grad_norm": 1.1416207551956177, + "learning_rate": 0.0002, + "loss": 1.2976, + "step": 7850 + }, + { + "epoch": 5.872245050429585, + "grad_norm": 1.1219907999038696, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 7860 + }, + { + "epoch": 5.8797161001120655, + "grad_norm": 0.8300467729568481, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 7870 + }, + { + "epoch": 5.8871871497945465, + "grad_norm": 1.00551438331604, + "learning_rate": 0.0002, + "loss": 1.3132, + "step": 7880 + }, + { + "epoch": 5.894658199477027, + "grad_norm": 0.8981153964996338, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 7890 + }, + { + "epoch": 5.902129249159507, + "grad_norm": 1.0247976779937744, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 7900 + }, + { + "epoch": 5.909600298841987, + "grad_norm": 1.0820319652557373, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 7910 + }, + { + "epoch": 5.917071348524468, + "grad_norm": 0.952675461769104, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 7920 + }, + { + "epoch": 5.924542398206948, + "grad_norm": 0.8666740655899048, + "learning_rate": 0.0002, + "loss": 1.307, + "step": 7930 + }, + { + "epoch": 5.932013447889428, + "grad_norm": 0.8640421032905579, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 7940 + }, + { + "epoch": 5.939484497571909, + "grad_norm": 1.2343276739120483, + "learning_rate": 0.0002, + "loss": 1.2386, + "step": 7950 + }, + { + "epoch": 5.946955547254389, + "grad_norm": 0.958046555519104, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 7960 + }, + { + "epoch": 5.9544265969368695, + "grad_norm": 1.0538510084152222, + "learning_rate": 0.0002, + "loss": 1.2352, + "step": 7970 + }, + { + "epoch": 5.9618976466193505, + "grad_norm": 1.2681571245193481, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 7980 + }, + { + "epoch": 5.969368696301831, + "grad_norm": 0.8171183466911316, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 7990 + }, + { + "epoch": 5.976839745984311, + "grad_norm": 0.9109523892402649, + "learning_rate": 0.0002, + "loss": 1.3412, + "step": 8000 + }, + { + "epoch": 5.984310795666791, + "grad_norm": 1.0040639638900757, + "learning_rate": 0.0002, + "loss": 1.3497, + "step": 8010 + }, + { + "epoch": 5.991781845349272, + "grad_norm": 0.9596554040908813, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 8020 + }, + { + "epoch": 5.999252895031752, + "grad_norm": 0.9782963991165161, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 8030 + }, + { + "epoch": 6.0, + "eval_loss": 2.0417845249176025, + "eval_runtime": 38.8465, + "eval_samples_per_second": 13.257, + "eval_steps_per_second": 1.673, + "step": 8031 + }, + { + "epoch": 6.006723944714232, + "grad_norm": 1.380823016166687, + "learning_rate": 0.0002, + "loss": 1.0886, + "step": 8040 + }, + { + "epoch": 6.014194994396712, + "grad_norm": 1.067636251449585, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 8050 + }, + { + "epoch": 6.021666044079193, + "grad_norm": 1.363402009010315, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 8060 + }, + { + "epoch": 6.0291370937616735, + "grad_norm": 0.9901054501533508, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 8070 + }, + { + "epoch": 6.036608143444154, + "grad_norm": 1.1545379161834717, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 8080 + }, + { + "epoch": 6.044079193126635, + "grad_norm": 1.2259265184402466, + "learning_rate": 0.0002, + "loss": 1.0644, + "step": 8090 + }, + { + "epoch": 6.051550242809115, + "grad_norm": 1.1237425804138184, + "learning_rate": 0.0002, + "loss": 1.1273, + "step": 8100 + }, + { + "epoch": 6.059021292491595, + "grad_norm": 1.2805622816085815, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 8110 + }, + { + "epoch": 6.066492342174075, + "grad_norm": 1.2270452976226807, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 8120 + }, + { + "epoch": 6.073963391856556, + "grad_norm": 1.1924101114273071, + "learning_rate": 0.0002, + "loss": 1.0692, + "step": 8130 + }, + { + "epoch": 6.081434441539036, + "grad_norm": 1.2543894052505493, + "learning_rate": 0.0002, + "loss": 1.1698, + "step": 8140 + }, + { + "epoch": 6.088905491221516, + "grad_norm": 1.1821149587631226, + "learning_rate": 0.0002, + "loss": 1.069, + "step": 8150 + }, + { + "epoch": 6.096376540903997, + "grad_norm": 1.2202836275100708, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 8160 + }, + { + "epoch": 6.1038475905864775, + "grad_norm": 1.0576019287109375, + "learning_rate": 0.0002, + "loss": 1.136, + "step": 8170 + }, + { + "epoch": 6.111318640268958, + "grad_norm": 1.31708824634552, + "learning_rate": 0.0002, + "loss": 1.1395, + "step": 8180 + }, + { + "epoch": 6.118789689951438, + "grad_norm": 1.0479495525360107, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 8190 + }, + { + "epoch": 6.126260739633919, + "grad_norm": 1.285003423690796, + "learning_rate": 0.0002, + "loss": 1.0764, + "step": 8200 + }, + { + "epoch": 6.133731789316399, + "grad_norm": 1.0989165306091309, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 8210 + }, + { + "epoch": 6.141202838998879, + "grad_norm": 1.1659013032913208, + "learning_rate": 0.0002, + "loss": 1.0981, + "step": 8220 + }, + { + "epoch": 6.14867388868136, + "grad_norm": 1.2796376943588257, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 8230 + }, + { + "epoch": 6.15614493836384, + "grad_norm": 1.060564637184143, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 8240 + }, + { + "epoch": 6.16361598804632, + "grad_norm": 1.3884605169296265, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 8250 + }, + { + "epoch": 6.1710870377288005, + "grad_norm": 1.1570569276809692, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 8260 + }, + { + "epoch": 6.1785580874112815, + "grad_norm": 1.4136502742767334, + "learning_rate": 0.0002, + "loss": 1.0386, + "step": 8270 + }, + { + "epoch": 6.186029137093762, + "grad_norm": 1.3396095037460327, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 8280 + }, + { + "epoch": 6.193500186776242, + "grad_norm": 1.2549997568130493, + "learning_rate": 0.0002, + "loss": 1.133, + "step": 8290 + }, + { + "epoch": 6.200971236458723, + "grad_norm": 1.3629751205444336, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 8300 + }, + { + "epoch": 6.208442286141203, + "grad_norm": 1.1029163599014282, + "learning_rate": 0.0002, + "loss": 1.1343, + "step": 8310 + }, + { + "epoch": 6.215913335823683, + "grad_norm": 1.1992450952529907, + "learning_rate": 0.0002, + "loss": 1.0895, + "step": 8320 + }, + { + "epoch": 6.223384385506163, + "grad_norm": 1.3317986726760864, + "learning_rate": 0.0002, + "loss": 1.1417, + "step": 8330 + }, + { + "epoch": 6.230855435188644, + "grad_norm": 1.0538336038589478, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 8340 + }, + { + "epoch": 6.238326484871124, + "grad_norm": 1.1767704486846924, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 8350 + }, + { + "epoch": 6.2457975345536045, + "grad_norm": 1.1213016510009766, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 8360 + }, + { + "epoch": 6.253268584236086, + "grad_norm": 1.1895716190338135, + "learning_rate": 0.0002, + "loss": 1.1241, + "step": 8370 + }, + { + "epoch": 6.260739633918566, + "grad_norm": 1.1078153848648071, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 8380 + }, + { + "epoch": 6.268210683601046, + "grad_norm": 1.1662801504135132, + "learning_rate": 0.0002, + "loss": 1.1124, + "step": 8390 + }, + { + "epoch": 6.275681733283526, + "grad_norm": 1.2071197032928467, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 8400 + }, + { + "epoch": 6.283152782966007, + "grad_norm": 1.2653778791427612, + "learning_rate": 0.0002, + "loss": 1.0625, + "step": 8410 + }, + { + "epoch": 6.290623832648487, + "grad_norm": 1.6128872632980347, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 8420 + }, + { + "epoch": 6.298094882330967, + "grad_norm": 1.4993070363998413, + "learning_rate": 0.0002, + "loss": 1.1212, + "step": 8430 + }, + { + "epoch": 6.305565932013448, + "grad_norm": 1.16339910030365, + "learning_rate": 0.0002, + "loss": 1.1516, + "step": 8440 + }, + { + "epoch": 6.313036981695928, + "grad_norm": 1.256822943687439, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 8450 + }, + { + "epoch": 6.3205080313784086, + "grad_norm": 1.1352964639663696, + "learning_rate": 0.0002, + "loss": 1.1566, + "step": 8460 + }, + { + "epoch": 6.327979081060889, + "grad_norm": 1.0061070919036865, + "learning_rate": 0.0002, + "loss": 1.1297, + "step": 8470 + }, + { + "epoch": 6.33545013074337, + "grad_norm": 1.1901768445968628, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 8480 + }, + { + "epoch": 6.34292118042585, + "grad_norm": 1.2715139389038086, + "learning_rate": 0.0002, + "loss": 1.1463, + "step": 8490 + }, + { + "epoch": 6.35039223010833, + "grad_norm": 1.1583346128463745, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 8500 + }, + { + "epoch": 6.357863279790811, + "grad_norm": 1.1427477598190308, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 8510 + }, + { + "epoch": 6.365334329473291, + "grad_norm": 1.1952263116836548, + "learning_rate": 0.0002, + "loss": 1.1119, + "step": 8520 + }, + { + "epoch": 6.372805379155771, + "grad_norm": 1.0599623918533325, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 8530 + }, + { + "epoch": 6.380276428838251, + "grad_norm": 1.3511574268341064, + "learning_rate": 0.0002, + "loss": 1.1091, + "step": 8540 + }, + { + "epoch": 6.387747478520732, + "grad_norm": 1.171126127243042, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 8550 + }, + { + "epoch": 6.395218528203213, + "grad_norm": 1.285474419593811, + "learning_rate": 0.0002, + "loss": 1.1615, + "step": 8560 + }, + { + "epoch": 6.402689577885693, + "grad_norm": 0.9751279950141907, + "learning_rate": 0.0002, + "loss": 1.1505, + "step": 8570 + }, + { + "epoch": 6.410160627568174, + "grad_norm": 1.2194149494171143, + "learning_rate": 0.0002, + "loss": 1.1502, + "step": 8580 + }, + { + "epoch": 6.417631677250654, + "grad_norm": 1.255888819694519, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 8590 + }, + { + "epoch": 6.425102726933134, + "grad_norm": 1.1636122465133667, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 8600 + }, + { + "epoch": 6.432573776615614, + "grad_norm": 1.0769859552383423, + "learning_rate": 0.0002, + "loss": 1.1398, + "step": 8610 + }, + { + "epoch": 6.440044826298095, + "grad_norm": 1.151778221130371, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 8620 + }, + { + "epoch": 6.447515875980575, + "grad_norm": 1.2749944925308228, + "learning_rate": 0.0002, + "loss": 1.0706, + "step": 8630 + }, + { + "epoch": 6.454986925663055, + "grad_norm": 1.1925828456878662, + "learning_rate": 0.0002, + "loss": 1.1011, + "step": 8640 + }, + { + "epoch": 6.4624579753455365, + "grad_norm": 1.166107416152954, + "learning_rate": 0.0002, + "loss": 1.1581, + "step": 8650 + }, + { + "epoch": 6.469929025028017, + "grad_norm": 1.0372248888015747, + "learning_rate": 0.0002, + "loss": 1.105, + "step": 8660 + }, + { + "epoch": 6.477400074710497, + "grad_norm": 1.26933753490448, + "learning_rate": 0.0002, + "loss": 1.1546, + "step": 8670 + }, + { + "epoch": 6.484871124392977, + "grad_norm": 1.2154223918914795, + "learning_rate": 0.0002, + "loss": 1.2362, + "step": 8680 + }, + { + "epoch": 6.492342174075458, + "grad_norm": 1.09475839138031, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 8690 + }, + { + "epoch": 6.499813223757938, + "grad_norm": 1.0763037204742432, + "learning_rate": 0.0002, + "loss": 1.1168, + "step": 8700 + }, + { + "epoch": 6.507284273440418, + "grad_norm": 1.1882896423339844, + "learning_rate": 0.0002, + "loss": 1.1993, + "step": 8710 + }, + { + "epoch": 6.514755323122898, + "grad_norm": 1.1662089824676514, + "learning_rate": 0.0002, + "loss": 1.1498, + "step": 8720 + }, + { + "epoch": 6.522226372805379, + "grad_norm": 1.3259495496749878, + "learning_rate": 0.0002, + "loss": 1.2008, + "step": 8730 + }, + { + "epoch": 6.5296974224878594, + "grad_norm": 1.0858017206192017, + "learning_rate": 0.0002, + "loss": 1.1289, + "step": 8740 + }, + { + "epoch": 6.53716847217034, + "grad_norm": 1.240337610244751, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 8750 + }, + { + "epoch": 6.544639521852821, + "grad_norm": 1.1381462812423706, + "learning_rate": 0.0002, + "loss": 1.1479, + "step": 8760 + }, + { + "epoch": 6.552110571535301, + "grad_norm": 1.2220063209533691, + "learning_rate": 0.0002, + "loss": 1.0991, + "step": 8770 + }, + { + "epoch": 6.559581621217781, + "grad_norm": 1.1553083658218384, + "learning_rate": 0.0002, + "loss": 1.159, + "step": 8780 + }, + { + "epoch": 6.567052670900262, + "grad_norm": 1.1383219957351685, + "learning_rate": 0.0002, + "loss": 1.0996, + "step": 8790 + }, + { + "epoch": 6.574523720582742, + "grad_norm": 1.0379676818847656, + "learning_rate": 0.0002, + "loss": 1.1355, + "step": 8800 + }, + { + "epoch": 6.581994770265222, + "grad_norm": 1.376488447189331, + "learning_rate": 0.0002, + "loss": 1.1704, + "step": 8810 + }, + { + "epoch": 6.589465819947702, + "grad_norm": 1.1586211919784546, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 8820 + }, + { + "epoch": 6.596936869630183, + "grad_norm": 1.28152334690094, + "learning_rate": 0.0002, + "loss": 1.1904, + "step": 8830 + }, + { + "epoch": 6.6044079193126635, + "grad_norm": 1.2656810283660889, + "learning_rate": 0.0002, + "loss": 1.1646, + "step": 8840 + }, + { + "epoch": 6.611878968995144, + "grad_norm": 1.0636502504348755, + "learning_rate": 0.0002, + "loss": 1.1865, + "step": 8850 + }, + { + "epoch": 6.619350018677624, + "grad_norm": 1.273239254951477, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 8860 + }, + { + "epoch": 6.626821068360105, + "grad_norm": 1.1055482625961304, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 8870 + }, + { + "epoch": 6.634292118042585, + "grad_norm": 1.1934176683425903, + "learning_rate": 0.0002, + "loss": 1.0877, + "step": 8880 + }, + { + "epoch": 6.641763167725065, + "grad_norm": 1.2248114347457886, + "learning_rate": 0.0002, + "loss": 1.194, + "step": 8890 + }, + { + "epoch": 6.649234217407546, + "grad_norm": 1.1950982809066772, + "learning_rate": 0.0002, + "loss": 1.1609, + "step": 8900 + }, + { + "epoch": 6.656705267090026, + "grad_norm": 1.0821784734725952, + "learning_rate": 0.0002, + "loss": 1.169, + "step": 8910 + }, + { + "epoch": 6.664176316772506, + "grad_norm": 1.0062463283538818, + "learning_rate": 0.0002, + "loss": 1.1337, + "step": 8920 + }, + { + "epoch": 6.671647366454987, + "grad_norm": 1.2373089790344238, + "learning_rate": 0.0002, + "loss": 1.1403, + "step": 8930 + }, + { + "epoch": 6.6791184161374675, + "grad_norm": 1.1821746826171875, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 8940 + }, + { + "epoch": 6.686589465819948, + "grad_norm": 1.2350659370422363, + "learning_rate": 0.0002, + "loss": 1.1214, + "step": 8950 + }, + { + "epoch": 6.694060515502428, + "grad_norm": 1.1012883186340332, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 8960 + }, + { + "epoch": 6.701531565184909, + "grad_norm": 1.2008943557739258, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 8970 + }, + { + "epoch": 6.709002614867389, + "grad_norm": 1.2355504035949707, + "learning_rate": 0.0002, + "loss": 1.1769, + "step": 8980 + }, + { + "epoch": 6.716473664549869, + "grad_norm": 1.2367502450942993, + "learning_rate": 0.0002, + "loss": 1.1323, + "step": 8990 + }, + { + "epoch": 6.723944714232349, + "grad_norm": 1.1075866222381592, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 9000 + }, + { + "epoch": 6.73141576391483, + "grad_norm": 1.246480941772461, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 9010 + }, + { + "epoch": 6.73888681359731, + "grad_norm": 1.1252824068069458, + "learning_rate": 0.0002, + "loss": 1.2154, + "step": 9020 + }, + { + "epoch": 6.7463578632797905, + "grad_norm": 1.0706887245178223, + "learning_rate": 0.0002, + "loss": 1.1762, + "step": 9030 + }, + { + "epoch": 6.7538289129622715, + "grad_norm": 1.0874755382537842, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 9040 + }, + { + "epoch": 6.761299962644752, + "grad_norm": 1.121434211730957, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 9050 + }, + { + "epoch": 6.768771012327232, + "grad_norm": 1.1517996788024902, + "learning_rate": 0.0002, + "loss": 1.2018, + "step": 9060 + }, + { + "epoch": 6.776242062009713, + "grad_norm": 1.2484540939331055, + "learning_rate": 0.0002, + "loss": 1.1593, + "step": 9070 + }, + { + "epoch": 6.783713111692193, + "grad_norm": 1.023059368133545, + "learning_rate": 0.0002, + "loss": 1.13, + "step": 9080 + }, + { + "epoch": 6.791184161374673, + "grad_norm": 1.1334631443023682, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 9090 + }, + { + "epoch": 6.798655211057153, + "grad_norm": 1.2991816997528076, + "learning_rate": 0.0002, + "loss": 1.18, + "step": 9100 + }, + { + "epoch": 6.806126260739634, + "grad_norm": 1.4147199392318726, + "learning_rate": 0.0002, + "loss": 1.2398, + "step": 9110 + }, + { + "epoch": 6.813597310422114, + "grad_norm": 1.1353832483291626, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 9120 + }, + { + "epoch": 6.8210683601045945, + "grad_norm": 1.0332539081573486, + "learning_rate": 0.0002, + "loss": 1.1379, + "step": 9130 + }, + { + "epoch": 6.828539409787075, + "grad_norm": 1.2208142280578613, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 9140 + }, + { + "epoch": 6.836010459469556, + "grad_norm": 1.3033398389816284, + "learning_rate": 0.0002, + "loss": 1.1463, + "step": 9150 + }, + { + "epoch": 6.843481509152036, + "grad_norm": 1.2676737308502197, + "learning_rate": 0.0002, + "loss": 1.1834, + "step": 9160 + }, + { + "epoch": 6.850952558834516, + "grad_norm": 1.1668603420257568, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 9170 + }, + { + "epoch": 6.858423608516997, + "grad_norm": 1.1994788646697998, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 9180 + }, + { + "epoch": 6.865894658199477, + "grad_norm": 1.231873869895935, + "learning_rate": 0.0002, + "loss": 1.2131, + "step": 9190 + }, + { + "epoch": 6.873365707881957, + "grad_norm": 0.9981484413146973, + "learning_rate": 0.0002, + "loss": 1.2109, + "step": 9200 + }, + { + "epoch": 6.880836757564438, + "grad_norm": 1.2799428701400757, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 9210 + }, + { + "epoch": 6.888307807246918, + "grad_norm": 1.2042057514190674, + "learning_rate": 0.0002, + "loss": 1.2004, + "step": 9220 + }, + { + "epoch": 6.8957788569293985, + "grad_norm": 1.070420265197754, + "learning_rate": 0.0002, + "loss": 1.1567, + "step": 9230 + }, + { + "epoch": 6.903249906611879, + "grad_norm": 1.327160358428955, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 9240 + }, + { + "epoch": 6.91072095629436, + "grad_norm": 1.1109007596969604, + "learning_rate": 0.0002, + "loss": 1.1945, + "step": 9250 + }, + { + "epoch": 6.91819200597684, + "grad_norm": 1.1669930219650269, + "learning_rate": 0.0002, + "loss": 1.1701, + "step": 9260 + }, + { + "epoch": 6.92566305565932, + "grad_norm": 1.034532904624939, + "learning_rate": 0.0002, + "loss": 1.1854, + "step": 9270 + }, + { + "epoch": 6.9331341053418, + "grad_norm": 1.1035540103912354, + "learning_rate": 0.0002, + "loss": 1.1712, + "step": 9280 + }, + { + "epoch": 6.940605155024281, + "grad_norm": 1.366254448890686, + "learning_rate": 0.0002, + "loss": 1.1767, + "step": 9290 + }, + { + "epoch": 6.948076204706761, + "grad_norm": 1.094214677810669, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 9300 + }, + { + "epoch": 6.955547254389241, + "grad_norm": 1.131238579750061, + "learning_rate": 0.0002, + "loss": 1.18, + "step": 9310 + }, + { + "epoch": 6.963018304071722, + "grad_norm": 1.202369213104248, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 9320 + }, + { + "epoch": 6.9704893537542025, + "grad_norm": 1.1067225933074951, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 9330 + }, + { + "epoch": 6.977960403436683, + "grad_norm": 1.0258643627166748, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 9340 + }, + { + "epoch": 6.985431453119164, + "grad_norm": 1.3311655521392822, + "learning_rate": 0.0002, + "loss": 1.2053, + "step": 9350 + }, + { + "epoch": 6.992902502801644, + "grad_norm": 1.1245559453964233, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 9360 + }, + { + "epoch": 6.999626447515876, + "eval_loss": 2.128103017807007, + "eval_runtime": 39.1339, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.661, + "step": 9369 + } + ], + "logging_steps": 10, + "max_steps": 10704, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.335994808397988e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-9369/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..67c7b4ca126d7b712ad1985ef15ffb29ebe76633 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc87da605e94ff0ecd8f5b371302a2d5f8727b77984707a2185ddb447fc3796 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8494c24717c1201780732d97f3f6759d9500a9ae --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9996264475158759, "step": 1338, "epoch_duration": 1438.9762296676636, "total_accumulated_duration": 1438.9762296676636, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}]} +{"epoch": 2.0, "step": 2677, "epoch_duration": 1435.5856530666351, "total_accumulated_duration": 2874.5618827342987, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-1338", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}]} +{"epoch": 2.999626447515876, "step": 4015, "epoch_duration": 1438.387151002884, "total_accumulated_duration": 4312.949033737183, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}, {"eval_loss": 1.8046749830245972, "eval_runtime": 38.5096, "eval_samples_per_second": 13.373, "eval_steps_per_second": 1.688, "epoch": 2.0, "step": 2677}, {"loss": 1.7265, "grad_norm": 0.36817631125450134, "learning_rate": 0.0002, "epoch": 2.002241314904744, "step": 2680}, {"loss": 1.548, "grad_norm": 0.4056456685066223, "learning_rate": 0.0002, "epoch": 2.0097123645872244, "step": 2690}, {"loss": 1.5515, "grad_norm": 0.37416863441467285, "learning_rate": 0.0002, "epoch": 2.017183414269705, "step": 2700}, {"loss": 1.5895, "grad_norm": 0.4273638427257538, "learning_rate": 0.0002, "epoch": 2.024654463952185, "step": 2710}, {"loss": 1.5884, "grad_norm": 0.36497923731803894, "learning_rate": 0.0002, "epoch": 2.0321255136346656, "step": 2720}, {"loss": 1.6999, "grad_norm": 0.5021994113922119, "learning_rate": 0.0002, "epoch": 2.0395965633171462, "step": 2730}, {"loss": 1.6655, "grad_norm": 0.45896220207214355, "learning_rate": 0.0002, "epoch": 2.0470676129996264, "step": 2740}, {"loss": 1.6305, "grad_norm": 0.3973815143108368, "learning_rate": 0.0002, "epoch": 2.054538662682107, "step": 2750}, {"loss": 1.6301, "grad_norm": 0.4521815776824951, "learning_rate": 0.0002, "epoch": 2.062009712364587, "step": 2760}, {"loss": 1.6189, "grad_norm": 0.42775002121925354, "learning_rate": 0.0002, "epoch": 2.0694807620470677, "step": 2770}, {"loss": 1.6491, "grad_norm": 0.48158586025238037, "learning_rate": 0.0002, "epoch": 2.076951811729548, "step": 2780}, {"loss": 1.6301, "grad_norm": 0.4612371623516083, "learning_rate": 0.0002, "epoch": 2.0844228614120284, "step": 2790}, {"loss": 1.6327, "grad_norm": 0.42536866664886475, "learning_rate": 0.0002, "epoch": 2.091893911094509, "step": 2800}, {"loss": 1.651, "grad_norm": 0.48515772819519043, "learning_rate": 0.0002, "epoch": 2.099364960776989, "step": 2810}, {"loss": 1.6829, "grad_norm": 0.41418662667274475, "learning_rate": 0.0002, "epoch": 2.1068360104594697, "step": 2820}, {"loss": 1.6266, "grad_norm": 0.4683697819709778, "learning_rate": 0.0002, "epoch": 2.11430706014195, "step": 2830}, {"loss": 1.6586, "grad_norm": 0.4484657049179077, "learning_rate": 0.0002, "epoch": 2.1217781098244304, "step": 2840}, {"loss": 1.6483, "grad_norm": 0.6621400713920593, "learning_rate": 0.0002, "epoch": 2.1292491595069105, "step": 2850}, {"loss": 1.5755, "grad_norm": 0.45074811577796936, "learning_rate": 0.0002, "epoch": 2.136720209189391, "step": 2860}, {"loss": 1.6456, "grad_norm": 0.3513113558292389, "learning_rate": 0.0002, "epoch": 2.1441912588718717, "step": 2870}, {"loss": 1.6081, "grad_norm": 0.40411314368247986, "learning_rate": 0.0002, "epoch": 2.151662308554352, "step": 2880}, {"loss": 1.6323, "grad_norm": 0.4121065139770508, "learning_rate": 0.0002, "epoch": 2.1591333582368324, "step": 2890}, {"loss": 1.6324, "grad_norm": 0.44723689556121826, "learning_rate": 0.0002, "epoch": 2.1666044079193125, "step": 2900}, {"loss": 1.5699, "grad_norm": 0.4226122498512268, "learning_rate": 0.0002, "epoch": 2.174075457601793, "step": 2910}, {"loss": 1.5652, "grad_norm": 0.46617650985717773, "learning_rate": 0.0002, "epoch": 2.1815465072842732, "step": 2920}, {"loss": 1.6378, "grad_norm": 0.4506422281265259, "learning_rate": 0.0002, "epoch": 2.189017556966754, "step": 2930}, {"loss": 1.6112, "grad_norm": 0.4892672896385193, "learning_rate": 0.0002, "epoch": 2.1964886066492344, "step": 2940}, {"loss": 1.6176, "grad_norm": 0.44095516204833984, "learning_rate": 0.0002, "epoch": 2.2039596563317145, "step": 2950}, {"loss": 1.6058, "grad_norm": 0.41522109508514404, "learning_rate": 0.0002, "epoch": 2.211430706014195, "step": 2960}, {"loss": 1.5964, "grad_norm": 0.4860858917236328, "learning_rate": 0.0002, "epoch": 2.2189017556966752, "step": 2970}, {"loss": 1.6427, "grad_norm": 0.42662516236305237, "learning_rate": 0.0002, "epoch": 2.226372805379156, "step": 2980}, {"loss": 1.6313, "grad_norm": 0.4390648305416107, "learning_rate": 0.0002, "epoch": 2.233843855061636, "step": 2990}, {"loss": 1.5992, "grad_norm": 0.47515565156936646, "learning_rate": 0.0002, "epoch": 2.2413149047441165, "step": 3000}, {"loss": 1.5563, "grad_norm": 0.4104543924331665, "learning_rate": 0.0002, "epoch": 2.248785954426597, "step": 3010}, {"loss": 1.6895, "grad_norm": 0.4404028654098511, "learning_rate": 0.0002, "epoch": 2.2562570041090773, "step": 3020}, {"loss": 1.6088, "grad_norm": 0.4717366695404053, "learning_rate": 0.0002, "epoch": 2.263728053791558, "step": 3030}, {"loss": 1.7287, "grad_norm": 0.48345857858657837, "learning_rate": 0.0002, "epoch": 2.271199103474038, "step": 3040}, {"loss": 1.681, "grad_norm": 0.5312452912330627, "learning_rate": 0.0002, "epoch": 2.2786701531565186, "step": 3050}, {"loss": 1.5901, "grad_norm": 0.5073099732398987, "learning_rate": 0.0002, "epoch": 2.2861412028389987, "step": 3060}, {"loss": 1.6914, "grad_norm": 0.5027463436126709, "learning_rate": 0.0002, "epoch": 2.2936122525214793, "step": 3070}, {"loss": 1.5862, "grad_norm": 0.5436304807662964, "learning_rate": 0.0002, "epoch": 2.30108330220396, "step": 3080}, {"loss": 1.5763, "grad_norm": 0.4701065123081207, "learning_rate": 0.0002, "epoch": 2.30855435188644, "step": 3090}, {"loss": 1.6177, "grad_norm": 0.46988746523857117, "learning_rate": 0.0002, "epoch": 2.3160254015689206, "step": 3100}, {"loss": 1.6502, "grad_norm": 0.45112869143486023, "learning_rate": 0.0002, "epoch": 2.3234964512514007, "step": 3110}, {"loss": 1.6291, "grad_norm": 0.5173566937446594, "learning_rate": 0.0002, "epoch": 2.3309675009338813, "step": 3120}, {"loss": 1.6743, "grad_norm": 0.40345850586891174, "learning_rate": 0.0002, "epoch": 2.3384385506163614, "step": 3130}, {"loss": 1.621, "grad_norm": 0.4218924939632416, "learning_rate": 0.0002, "epoch": 2.345909600298842, "step": 3140}, {"loss": 1.6341, "grad_norm": 0.41857317090034485, "learning_rate": 0.0002, "epoch": 2.3533806499813226, "step": 3150}, {"loss": 1.6087, "grad_norm": 0.4197218418121338, "learning_rate": 0.0002, "epoch": 2.3608516996638027, "step": 3160}, {"loss": 1.6572, "grad_norm": 0.4260677397251129, "learning_rate": 0.0002, "epoch": 2.3683227493462833, "step": 3170}, {"loss": 1.6376, "grad_norm": 0.4209042191505432, "learning_rate": 0.0002, "epoch": 2.3757937990287634, "step": 3180}, {"loss": 1.634, "grad_norm": 0.4092234969139099, "learning_rate": 0.0002, "epoch": 2.383264848711244, "step": 3190}, {"loss": 1.6339, "grad_norm": 0.4928431510925293, "learning_rate": 0.0002, "epoch": 2.390735898393724, "step": 3200}, {"loss": 1.6015, "grad_norm": 0.49252402782440186, "learning_rate": 0.0002, "epoch": 2.3982069480762047, "step": 3210}, {"loss": 1.5773, "grad_norm": 0.4368397295475006, "learning_rate": 0.0002, "epoch": 2.4056779977586853, "step": 3220}, {"loss": 1.6629, "grad_norm": 0.46122390031814575, "learning_rate": 0.0002, "epoch": 2.4131490474411654, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4272301197052002, "learning_rate": 0.0002, "epoch": 2.420620097123646, "step": 3240}, {"loss": 1.5961, "grad_norm": 0.41480937600135803, "learning_rate": 0.0002, "epoch": 2.428091146806126, "step": 3250}, {"loss": 1.6281, "grad_norm": 0.48911941051483154, "learning_rate": 0.0002, "epoch": 2.4355621964886067, "step": 3260}, {"loss": 1.6846, "grad_norm": 0.4444098472595215, "learning_rate": 0.0002, "epoch": 2.443033246171087, "step": 3270}, {"loss": 1.6961, "grad_norm": 0.5111684799194336, "learning_rate": 0.0002, "epoch": 2.4505042958535674, "step": 3280}, {"loss": 1.6152, "grad_norm": 0.5058825016021729, "learning_rate": 0.0002, "epoch": 2.457975345536048, "step": 3290}, {"loss": 1.625, "grad_norm": 0.44173210859298706, "learning_rate": 0.0002, "epoch": 2.465446395218528, "step": 3300}, {"loss": 1.6491, "grad_norm": 0.4659745991230011, "learning_rate": 0.0002, "epoch": 2.4729174449010087, "step": 3310}, {"loss": 1.6114, "grad_norm": 0.47237497568130493, "learning_rate": 0.0002, "epoch": 2.480388494583489, "step": 3320}, {"loss": 1.6193, "grad_norm": 0.47303131222724915, "learning_rate": 0.0002, "epoch": 2.4878595442659694, "step": 3330}, {"loss": 1.7256, "grad_norm": 0.4522389769554138, "learning_rate": 0.0002, "epoch": 2.4953305939484496, "step": 3340}, {"loss": 1.6834, "grad_norm": 0.4467332363128662, "learning_rate": 0.0002, "epoch": 2.50280164363093, "step": 3350}, {"loss": 1.6108, "grad_norm": 0.4413762092590332, "learning_rate": 0.0002, "epoch": 2.5102726933134107, "step": 3360}, {"loss": 1.537, "grad_norm": 0.495514452457428, "learning_rate": 0.0002, "epoch": 2.517743742995891, "step": 3370}, {"loss": 1.5839, "grad_norm": 0.4429773986339569, "learning_rate": 0.0002, "epoch": 2.5252147926783715, "step": 3380}, {"loss": 1.6522, "grad_norm": 0.4589079022407532, "learning_rate": 0.0002, "epoch": 2.5326858423608516, "step": 3390}, {"loss": 1.6529, "grad_norm": 0.4683997333049774, "learning_rate": 0.0002, "epoch": 2.540156892043332, "step": 3400}, {"loss": 1.6745, "grad_norm": 0.4651731252670288, "learning_rate": 0.0002, "epoch": 2.5476279417258123, "step": 3410}, {"loss": 1.5918, "grad_norm": 0.45818084478378296, "learning_rate": 0.0002, "epoch": 2.555098991408293, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.45209529995918274, "learning_rate": 0.0002, "epoch": 2.5625700410907735, "step": 3430}, {"loss": 1.5606, "grad_norm": 0.4344733655452728, "learning_rate": 0.0002, "epoch": 2.5700410907732536, "step": 3440}, {"loss": 1.6748, "grad_norm": 0.47435566782951355, "learning_rate": 0.0002, "epoch": 2.577512140455734, "step": 3450}, {"loss": 1.6237, "grad_norm": 0.43841999769210815, "learning_rate": 0.0002, "epoch": 2.5849831901382143, "step": 3460}, {"loss": 1.7207, "grad_norm": 0.4323869049549103, "learning_rate": 0.0002, "epoch": 2.592454239820695, "step": 3470}, {"loss": 1.5494, "grad_norm": 0.44355881214141846, "learning_rate": 0.0002, "epoch": 2.599925289503175, "step": 3480}, {"loss": 1.665, "grad_norm": 0.45847779512405396, "learning_rate": 0.0002, "epoch": 2.6073963391856556, "step": 3490}, {"loss": 1.6006, "grad_norm": 0.4411061704158783, "learning_rate": 0.0002, "epoch": 2.614867388868136, "step": 3500}, {"loss": 1.5868, "grad_norm": 0.4446796178817749, "learning_rate": 0.0002, "epoch": 2.6223384385506163, "step": 3510}, {"loss": 1.5946, "grad_norm": 0.41969653964042664, "learning_rate": 0.0002, "epoch": 2.629809488233097, "step": 3520}, {"loss": 1.6798, "grad_norm": 0.5263747572898865, "learning_rate": 0.0002, "epoch": 2.637280537915577, "step": 3530}, {"loss": 1.6309, "grad_norm": 0.47719451785087585, "learning_rate": 0.0002, "epoch": 2.6447515875980576, "step": 3540}, {"loss": 1.7024, "grad_norm": 0.46574118733406067, "learning_rate": 0.0002, "epoch": 2.6522226372805378, "step": 3550}, {"loss": 1.618, "grad_norm": 0.46867135167121887, "learning_rate": 0.0002, "epoch": 2.6596936869630183, "step": 3560}, {"loss": 1.5885, "grad_norm": 0.4441198706626892, "learning_rate": 0.0002, "epoch": 2.667164736645499, "step": 3570}, {"loss": 1.6426, "grad_norm": 0.4871319830417633, "learning_rate": 0.0002, "epoch": 2.674635786327979, "step": 3580}, {"loss": 1.6575, "grad_norm": 0.43900373578071594, "learning_rate": 0.0002, "epoch": 2.6821068360104596, "step": 3590}, {"loss": 1.6071, "grad_norm": 0.42509549856185913, "learning_rate": 0.0002, "epoch": 2.6895778856929398, "step": 3600}, {"loss": 1.5651, "grad_norm": 0.4691086709499359, "learning_rate": 0.0002, "epoch": 2.6970489353754203, "step": 3610}, {"loss": 1.5491, "grad_norm": 0.46318942308425903, "learning_rate": 0.0002, "epoch": 2.7045199850579005, "step": 3620}, {"loss": 1.5422, "grad_norm": 0.44631096720695496, "learning_rate": 0.0002, "epoch": 2.711991034740381, "step": 3630}, {"loss": 1.6831, "grad_norm": 0.42315489053726196, "learning_rate": 0.0002, "epoch": 2.7194620844228616, "step": 3640}, {"loss": 1.6008, "grad_norm": 0.4971241056919098, "learning_rate": 0.0002, "epoch": 2.7269331341053418, "step": 3650}, {"loss": 1.6042, "grad_norm": 0.4578486382961273, "learning_rate": 0.0002, "epoch": 2.7344041837878224, "step": 3660}, {"loss": 1.6076, "grad_norm": 0.46584776043891907, "learning_rate": 0.0002, "epoch": 2.7418752334703025, "step": 3670}, {"loss": 1.6809, "grad_norm": 0.4951731264591217, "learning_rate": 0.0002, "epoch": 2.749346283152783, "step": 3680}, {"loss": 1.6226, "grad_norm": 0.4935225546360016, "learning_rate": 0.0002, "epoch": 2.756817332835263, "step": 3690}, {"loss": 1.5878, "grad_norm": 0.41805586218833923, "learning_rate": 0.0002, "epoch": 2.764288382517744, "step": 3700}, {"loss": 1.7173, "grad_norm": 0.4417555630207062, "learning_rate": 0.0002, "epoch": 2.7717594322002244, "step": 3710}, {"loss": 1.6398, "grad_norm": 0.48229655623435974, "learning_rate": 0.0002, "epoch": 2.7792304818827045, "step": 3720}, {"loss": 1.6074, "grad_norm": 0.48562315106391907, "learning_rate": 0.0002, "epoch": 2.786701531565185, "step": 3730}, {"loss": 1.607, "grad_norm": 0.4473940432071686, "learning_rate": 0.0002, "epoch": 2.794172581247665, "step": 3740}, {"loss": 1.6065, "grad_norm": 0.4626813232898712, "learning_rate": 0.0002, "epoch": 2.801643630930146, "step": 3750}, {"loss": 1.6296, "grad_norm": 0.4339792728424072, "learning_rate": 0.0002, "epoch": 2.809114680612626, "step": 3760}, {"loss": 1.6815, "grad_norm": 0.5250858068466187, "learning_rate": 0.0002, "epoch": 2.8165857302951065, "step": 3770}, {"loss": 1.6644, "grad_norm": 0.4537523090839386, "learning_rate": 0.0002, "epoch": 2.824056779977587, "step": 3780}, {"loss": 1.6535, "grad_norm": 0.5646113157272339, "learning_rate": 0.0002, "epoch": 2.831527829660067, "step": 3790}, {"loss": 1.5712, "grad_norm": 0.44243332743644714, "learning_rate": 0.0002, "epoch": 2.8389988793425474, "step": 3800}, {"loss": 1.6478, "grad_norm": 0.4585791826248169, "learning_rate": 0.0002, "epoch": 2.846469929025028, "step": 3810}, {"loss": 1.6854, "grad_norm": 0.489702045917511, "learning_rate": 0.0002, "epoch": 2.8539409787075085, "step": 3820}, {"loss": 1.7066, "grad_norm": 0.502470850944519, "learning_rate": 0.0002, "epoch": 2.8614120283899886, "step": 3830}, {"loss": 1.5785, "grad_norm": 0.4395960867404938, "learning_rate": 0.0002, "epoch": 2.8688830780724692, "step": 3840}, {"loss": 1.6434, "grad_norm": 0.4348670244216919, "learning_rate": 0.0002, "epoch": 2.87635412775495, "step": 3850}, {"loss": 1.6163, "grad_norm": 0.48852720856666565, "learning_rate": 0.0002, "epoch": 2.88382517743743, "step": 3860}, {"loss": 1.5916, "grad_norm": 0.45317450165748596, "learning_rate": 0.0002, "epoch": 2.89129622711991, "step": 3870}, {"loss": 1.6486, "grad_norm": 0.4732758700847626, "learning_rate": 0.0002, "epoch": 2.8987672768023907, "step": 3880}, {"loss": 1.6758, "grad_norm": 0.45238012075424194, "learning_rate": 0.0002, "epoch": 2.9062383264848712, "step": 3890}, {"loss": 1.6228, "grad_norm": 0.48838064074516296, "learning_rate": 0.0002, "epoch": 2.9137093761673514, "step": 3900}, {"loss": 1.658, "grad_norm": 0.43496349453926086, "learning_rate": 0.0002, "epoch": 2.921180425849832, "step": 3910}, {"loss": 1.7063, "grad_norm": 0.47963935136795044, "learning_rate": 0.0002, "epoch": 2.9286514755323125, "step": 3920}, {"loss": 1.6553, "grad_norm": 0.4544987976551056, "learning_rate": 0.0002, "epoch": 2.9361225252147927, "step": 3930}, {"loss": 1.6192, "grad_norm": 0.4622892141342163, "learning_rate": 0.0002, "epoch": 2.943593574897273, "step": 3940}, {"loss": 1.6178, "grad_norm": 0.47026222944259644, "learning_rate": 0.0002, "epoch": 2.9510646245797534, "step": 3950}, {"loss": 1.6612, "grad_norm": 0.4549552798271179, "learning_rate": 0.0002, "epoch": 2.958535674262234, "step": 3960}, {"loss": 1.6458, "grad_norm": 0.46647515892982483, "learning_rate": 0.0002, "epoch": 2.966006723944714, "step": 3970}, {"loss": 1.6051, "grad_norm": 0.45095112919807434, "learning_rate": 0.0002, "epoch": 2.9734777736271947, "step": 3980}, {"loss": 1.6471, "grad_norm": 0.4690017104148865, "learning_rate": 0.0002, "epoch": 2.9809488233096753, "step": 3990}, {"loss": 1.6061, "grad_norm": 0.4603444039821625, "learning_rate": 0.0002, "epoch": 2.9884198729921554, "step": 4000}, {"loss": 1.6431, "grad_norm": 0.4743294417858124, "learning_rate": 0.0002, "epoch": 2.9958909226746355, "step": 4010}]} +{"epoch": 4.0, "step": 5354, "epoch_duration": 1510.5811264514923, "total_accumulated_duration": 5823.530160188675, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}, {"eval_loss": 1.8046749830245972, "eval_runtime": 38.5096, "eval_samples_per_second": 13.373, "eval_steps_per_second": 1.688, "epoch": 2.0, "step": 2677}, {"loss": 1.7265, "grad_norm": 0.36817631125450134, "learning_rate": 0.0002, "epoch": 2.002241314904744, "step": 2680}, {"loss": 1.548, "grad_norm": 0.4056456685066223, "learning_rate": 0.0002, "epoch": 2.0097123645872244, "step": 2690}, {"loss": 1.5515, "grad_norm": 0.37416863441467285, "learning_rate": 0.0002, "epoch": 2.017183414269705, "step": 2700}, {"loss": 1.5895, "grad_norm": 0.4273638427257538, "learning_rate": 0.0002, "epoch": 2.024654463952185, "step": 2710}, {"loss": 1.5884, "grad_norm": 0.36497923731803894, "learning_rate": 0.0002, "epoch": 2.0321255136346656, "step": 2720}, {"loss": 1.6999, "grad_norm": 0.5021994113922119, "learning_rate": 0.0002, "epoch": 2.0395965633171462, "step": 2730}, {"loss": 1.6655, "grad_norm": 0.45896220207214355, "learning_rate": 0.0002, "epoch": 2.0470676129996264, "step": 2740}, {"loss": 1.6305, "grad_norm": 0.3973815143108368, "learning_rate": 0.0002, "epoch": 2.054538662682107, "step": 2750}, {"loss": 1.6301, "grad_norm": 0.4521815776824951, "learning_rate": 0.0002, "epoch": 2.062009712364587, "step": 2760}, {"loss": 1.6189, "grad_norm": 0.42775002121925354, "learning_rate": 0.0002, "epoch": 2.0694807620470677, "step": 2770}, {"loss": 1.6491, "grad_norm": 0.48158586025238037, "learning_rate": 0.0002, "epoch": 2.076951811729548, "step": 2780}, {"loss": 1.6301, "grad_norm": 0.4612371623516083, "learning_rate": 0.0002, "epoch": 2.0844228614120284, "step": 2790}, {"loss": 1.6327, "grad_norm": 0.42536866664886475, "learning_rate": 0.0002, "epoch": 2.091893911094509, "step": 2800}, {"loss": 1.651, "grad_norm": 0.48515772819519043, "learning_rate": 0.0002, "epoch": 2.099364960776989, "step": 2810}, {"loss": 1.6829, "grad_norm": 0.41418662667274475, "learning_rate": 0.0002, "epoch": 2.1068360104594697, "step": 2820}, {"loss": 1.6266, "grad_norm": 0.4683697819709778, "learning_rate": 0.0002, "epoch": 2.11430706014195, "step": 2830}, {"loss": 1.6586, "grad_norm": 0.4484657049179077, "learning_rate": 0.0002, "epoch": 2.1217781098244304, "step": 2840}, {"loss": 1.6483, "grad_norm": 0.6621400713920593, "learning_rate": 0.0002, "epoch": 2.1292491595069105, "step": 2850}, {"loss": 1.5755, "grad_norm": 0.45074811577796936, "learning_rate": 0.0002, "epoch": 2.136720209189391, "step": 2860}, {"loss": 1.6456, "grad_norm": 0.3513113558292389, "learning_rate": 0.0002, "epoch": 2.1441912588718717, "step": 2870}, {"loss": 1.6081, "grad_norm": 0.40411314368247986, "learning_rate": 0.0002, "epoch": 2.151662308554352, "step": 2880}, {"loss": 1.6323, "grad_norm": 0.4121065139770508, "learning_rate": 0.0002, "epoch": 2.1591333582368324, "step": 2890}, {"loss": 1.6324, "grad_norm": 0.44723689556121826, "learning_rate": 0.0002, "epoch": 2.1666044079193125, "step": 2900}, {"loss": 1.5699, "grad_norm": 0.4226122498512268, "learning_rate": 0.0002, "epoch": 2.174075457601793, "step": 2910}, {"loss": 1.5652, "grad_norm": 0.46617650985717773, "learning_rate": 0.0002, "epoch": 2.1815465072842732, "step": 2920}, {"loss": 1.6378, "grad_norm": 0.4506422281265259, "learning_rate": 0.0002, "epoch": 2.189017556966754, "step": 2930}, {"loss": 1.6112, "grad_norm": 0.4892672896385193, "learning_rate": 0.0002, "epoch": 2.1964886066492344, "step": 2940}, {"loss": 1.6176, "grad_norm": 0.44095516204833984, "learning_rate": 0.0002, "epoch": 2.2039596563317145, "step": 2950}, {"loss": 1.6058, "grad_norm": 0.41522109508514404, "learning_rate": 0.0002, "epoch": 2.211430706014195, "step": 2960}, {"loss": 1.5964, "grad_norm": 0.4860858917236328, "learning_rate": 0.0002, "epoch": 2.2189017556966752, "step": 2970}, {"loss": 1.6427, "grad_norm": 0.42662516236305237, "learning_rate": 0.0002, "epoch": 2.226372805379156, "step": 2980}, {"loss": 1.6313, "grad_norm": 0.4390648305416107, "learning_rate": 0.0002, "epoch": 2.233843855061636, "step": 2990}, {"loss": 1.5992, "grad_norm": 0.47515565156936646, "learning_rate": 0.0002, "epoch": 2.2413149047441165, "step": 3000}, {"loss": 1.5563, "grad_norm": 0.4104543924331665, "learning_rate": 0.0002, "epoch": 2.248785954426597, "step": 3010}, {"loss": 1.6895, "grad_norm": 0.4404028654098511, "learning_rate": 0.0002, "epoch": 2.2562570041090773, "step": 3020}, {"loss": 1.6088, "grad_norm": 0.4717366695404053, "learning_rate": 0.0002, "epoch": 2.263728053791558, "step": 3030}, {"loss": 1.7287, "grad_norm": 0.48345857858657837, "learning_rate": 0.0002, "epoch": 2.271199103474038, "step": 3040}, {"loss": 1.681, "grad_norm": 0.5312452912330627, "learning_rate": 0.0002, "epoch": 2.2786701531565186, "step": 3050}, {"loss": 1.5901, "grad_norm": 0.5073099732398987, "learning_rate": 0.0002, "epoch": 2.2861412028389987, "step": 3060}, {"loss": 1.6914, "grad_norm": 0.5027463436126709, "learning_rate": 0.0002, "epoch": 2.2936122525214793, "step": 3070}, {"loss": 1.5862, "grad_norm": 0.5436304807662964, "learning_rate": 0.0002, "epoch": 2.30108330220396, "step": 3080}, {"loss": 1.5763, "grad_norm": 0.4701065123081207, "learning_rate": 0.0002, "epoch": 2.30855435188644, "step": 3090}, {"loss": 1.6177, "grad_norm": 0.46988746523857117, "learning_rate": 0.0002, "epoch": 2.3160254015689206, "step": 3100}, {"loss": 1.6502, "grad_norm": 0.45112869143486023, "learning_rate": 0.0002, "epoch": 2.3234964512514007, "step": 3110}, {"loss": 1.6291, "grad_norm": 0.5173566937446594, "learning_rate": 0.0002, "epoch": 2.3309675009338813, "step": 3120}, {"loss": 1.6743, "grad_norm": 0.40345850586891174, "learning_rate": 0.0002, "epoch": 2.3384385506163614, "step": 3130}, {"loss": 1.621, "grad_norm": 0.4218924939632416, "learning_rate": 0.0002, "epoch": 2.345909600298842, "step": 3140}, {"loss": 1.6341, "grad_norm": 0.41857317090034485, "learning_rate": 0.0002, "epoch": 2.3533806499813226, "step": 3150}, {"loss": 1.6087, "grad_norm": 0.4197218418121338, "learning_rate": 0.0002, "epoch": 2.3608516996638027, "step": 3160}, {"loss": 1.6572, "grad_norm": 0.4260677397251129, "learning_rate": 0.0002, "epoch": 2.3683227493462833, "step": 3170}, {"loss": 1.6376, "grad_norm": 0.4209042191505432, "learning_rate": 0.0002, "epoch": 2.3757937990287634, "step": 3180}, {"loss": 1.634, "grad_norm": 0.4092234969139099, "learning_rate": 0.0002, "epoch": 2.383264848711244, "step": 3190}, {"loss": 1.6339, "grad_norm": 0.4928431510925293, "learning_rate": 0.0002, "epoch": 2.390735898393724, "step": 3200}, {"loss": 1.6015, "grad_norm": 0.49252402782440186, "learning_rate": 0.0002, "epoch": 2.3982069480762047, "step": 3210}, {"loss": 1.5773, "grad_norm": 0.4368397295475006, "learning_rate": 0.0002, "epoch": 2.4056779977586853, "step": 3220}, {"loss": 1.6629, "grad_norm": 0.46122390031814575, "learning_rate": 0.0002, "epoch": 2.4131490474411654, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4272301197052002, "learning_rate": 0.0002, "epoch": 2.420620097123646, "step": 3240}, {"loss": 1.5961, "grad_norm": 0.41480937600135803, "learning_rate": 0.0002, "epoch": 2.428091146806126, "step": 3250}, {"loss": 1.6281, "grad_norm": 0.48911941051483154, "learning_rate": 0.0002, "epoch": 2.4355621964886067, "step": 3260}, {"loss": 1.6846, "grad_norm": 0.4444098472595215, "learning_rate": 0.0002, "epoch": 2.443033246171087, "step": 3270}, {"loss": 1.6961, "grad_norm": 0.5111684799194336, "learning_rate": 0.0002, "epoch": 2.4505042958535674, "step": 3280}, {"loss": 1.6152, "grad_norm": 0.5058825016021729, "learning_rate": 0.0002, "epoch": 2.457975345536048, "step": 3290}, {"loss": 1.625, "grad_norm": 0.44173210859298706, "learning_rate": 0.0002, "epoch": 2.465446395218528, "step": 3300}, {"loss": 1.6491, "grad_norm": 0.4659745991230011, "learning_rate": 0.0002, "epoch": 2.4729174449010087, "step": 3310}, {"loss": 1.6114, "grad_norm": 0.47237497568130493, "learning_rate": 0.0002, "epoch": 2.480388494583489, "step": 3320}, {"loss": 1.6193, "grad_norm": 0.47303131222724915, "learning_rate": 0.0002, "epoch": 2.4878595442659694, "step": 3330}, {"loss": 1.7256, "grad_norm": 0.4522389769554138, "learning_rate": 0.0002, "epoch": 2.4953305939484496, "step": 3340}, {"loss": 1.6834, "grad_norm": 0.4467332363128662, "learning_rate": 0.0002, "epoch": 2.50280164363093, "step": 3350}, {"loss": 1.6108, "grad_norm": 0.4413762092590332, "learning_rate": 0.0002, "epoch": 2.5102726933134107, "step": 3360}, {"loss": 1.537, "grad_norm": 0.495514452457428, "learning_rate": 0.0002, "epoch": 2.517743742995891, "step": 3370}, {"loss": 1.5839, "grad_norm": 0.4429773986339569, "learning_rate": 0.0002, "epoch": 2.5252147926783715, "step": 3380}, {"loss": 1.6522, "grad_norm": 0.4589079022407532, "learning_rate": 0.0002, "epoch": 2.5326858423608516, "step": 3390}, {"loss": 1.6529, "grad_norm": 0.4683997333049774, "learning_rate": 0.0002, "epoch": 2.540156892043332, "step": 3400}, {"loss": 1.6745, "grad_norm": 0.4651731252670288, "learning_rate": 0.0002, "epoch": 2.5476279417258123, "step": 3410}, {"loss": 1.5918, "grad_norm": 0.45818084478378296, "learning_rate": 0.0002, "epoch": 2.555098991408293, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.45209529995918274, "learning_rate": 0.0002, "epoch": 2.5625700410907735, "step": 3430}, {"loss": 1.5606, "grad_norm": 0.4344733655452728, "learning_rate": 0.0002, "epoch": 2.5700410907732536, "step": 3440}, {"loss": 1.6748, "grad_norm": 0.47435566782951355, "learning_rate": 0.0002, "epoch": 2.577512140455734, "step": 3450}, {"loss": 1.6237, "grad_norm": 0.43841999769210815, "learning_rate": 0.0002, "epoch": 2.5849831901382143, "step": 3460}, {"loss": 1.7207, "grad_norm": 0.4323869049549103, "learning_rate": 0.0002, "epoch": 2.592454239820695, "step": 3470}, {"loss": 1.5494, "grad_norm": 0.44355881214141846, "learning_rate": 0.0002, "epoch": 2.599925289503175, "step": 3480}, {"loss": 1.665, "grad_norm": 0.45847779512405396, "learning_rate": 0.0002, "epoch": 2.6073963391856556, "step": 3490}, {"loss": 1.6006, "grad_norm": 0.4411061704158783, "learning_rate": 0.0002, "epoch": 2.614867388868136, "step": 3500}, {"loss": 1.5868, "grad_norm": 0.4446796178817749, "learning_rate": 0.0002, "epoch": 2.6223384385506163, "step": 3510}, {"loss": 1.5946, "grad_norm": 0.41969653964042664, "learning_rate": 0.0002, "epoch": 2.629809488233097, "step": 3520}, {"loss": 1.6798, "grad_norm": 0.5263747572898865, "learning_rate": 0.0002, "epoch": 2.637280537915577, "step": 3530}, {"loss": 1.6309, "grad_norm": 0.47719451785087585, "learning_rate": 0.0002, "epoch": 2.6447515875980576, "step": 3540}, {"loss": 1.7024, "grad_norm": 0.46574118733406067, "learning_rate": 0.0002, "epoch": 2.6522226372805378, "step": 3550}, {"loss": 1.618, "grad_norm": 0.46867135167121887, "learning_rate": 0.0002, "epoch": 2.6596936869630183, "step": 3560}, {"loss": 1.5885, "grad_norm": 0.4441198706626892, "learning_rate": 0.0002, "epoch": 2.667164736645499, "step": 3570}, {"loss": 1.6426, "grad_norm": 0.4871319830417633, "learning_rate": 0.0002, "epoch": 2.674635786327979, "step": 3580}, {"loss": 1.6575, "grad_norm": 0.43900373578071594, "learning_rate": 0.0002, "epoch": 2.6821068360104596, "step": 3590}, {"loss": 1.6071, "grad_norm": 0.42509549856185913, "learning_rate": 0.0002, "epoch": 2.6895778856929398, "step": 3600}, {"loss": 1.5651, "grad_norm": 0.4691086709499359, "learning_rate": 0.0002, "epoch": 2.6970489353754203, "step": 3610}, {"loss": 1.5491, "grad_norm": 0.46318942308425903, "learning_rate": 0.0002, "epoch": 2.7045199850579005, "step": 3620}, {"loss": 1.5422, "grad_norm": 0.44631096720695496, "learning_rate": 0.0002, "epoch": 2.711991034740381, "step": 3630}, {"loss": 1.6831, "grad_norm": 0.42315489053726196, "learning_rate": 0.0002, "epoch": 2.7194620844228616, "step": 3640}, {"loss": 1.6008, "grad_norm": 0.4971241056919098, "learning_rate": 0.0002, "epoch": 2.7269331341053418, "step": 3650}, {"loss": 1.6042, "grad_norm": 0.4578486382961273, "learning_rate": 0.0002, "epoch": 2.7344041837878224, "step": 3660}, {"loss": 1.6076, "grad_norm": 0.46584776043891907, "learning_rate": 0.0002, "epoch": 2.7418752334703025, "step": 3670}, {"loss": 1.6809, "grad_norm": 0.4951731264591217, "learning_rate": 0.0002, "epoch": 2.749346283152783, "step": 3680}, {"loss": 1.6226, "grad_norm": 0.4935225546360016, "learning_rate": 0.0002, "epoch": 2.756817332835263, "step": 3690}, {"loss": 1.5878, "grad_norm": 0.41805586218833923, "learning_rate": 0.0002, "epoch": 2.764288382517744, "step": 3700}, {"loss": 1.7173, "grad_norm": 0.4417555630207062, "learning_rate": 0.0002, "epoch": 2.7717594322002244, "step": 3710}, {"loss": 1.6398, "grad_norm": 0.48229655623435974, "learning_rate": 0.0002, "epoch": 2.7792304818827045, "step": 3720}, {"loss": 1.6074, "grad_norm": 0.48562315106391907, "learning_rate": 0.0002, "epoch": 2.786701531565185, "step": 3730}, {"loss": 1.607, "grad_norm": 0.4473940432071686, "learning_rate": 0.0002, "epoch": 2.794172581247665, "step": 3740}, {"loss": 1.6065, "grad_norm": 0.4626813232898712, "learning_rate": 0.0002, "epoch": 2.801643630930146, "step": 3750}, {"loss": 1.6296, "grad_norm": 0.4339792728424072, "learning_rate": 0.0002, "epoch": 2.809114680612626, "step": 3760}, {"loss": 1.6815, "grad_norm": 0.5250858068466187, "learning_rate": 0.0002, "epoch": 2.8165857302951065, "step": 3770}, {"loss": 1.6644, "grad_norm": 0.4537523090839386, "learning_rate": 0.0002, "epoch": 2.824056779977587, "step": 3780}, {"loss": 1.6535, "grad_norm": 0.5646113157272339, "learning_rate": 0.0002, "epoch": 2.831527829660067, "step": 3790}, {"loss": 1.5712, "grad_norm": 0.44243332743644714, "learning_rate": 0.0002, "epoch": 2.8389988793425474, "step": 3800}, {"loss": 1.6478, "grad_norm": 0.4585791826248169, "learning_rate": 0.0002, "epoch": 2.846469929025028, "step": 3810}, {"loss": 1.6854, "grad_norm": 0.489702045917511, "learning_rate": 0.0002, "epoch": 2.8539409787075085, "step": 3820}, {"loss": 1.7066, "grad_norm": 0.502470850944519, "learning_rate": 0.0002, "epoch": 2.8614120283899886, "step": 3830}, {"loss": 1.5785, "grad_norm": 0.4395960867404938, "learning_rate": 0.0002, "epoch": 2.8688830780724692, "step": 3840}, {"loss": 1.6434, "grad_norm": 0.4348670244216919, "learning_rate": 0.0002, "epoch": 2.87635412775495, "step": 3850}, {"loss": 1.6163, "grad_norm": 0.48852720856666565, "learning_rate": 0.0002, "epoch": 2.88382517743743, "step": 3860}, {"loss": 1.5916, "grad_norm": 0.45317450165748596, "learning_rate": 0.0002, "epoch": 2.89129622711991, "step": 3870}, {"loss": 1.6486, "grad_norm": 0.4732758700847626, "learning_rate": 0.0002, "epoch": 2.8987672768023907, "step": 3880}, {"loss": 1.6758, "grad_norm": 0.45238012075424194, "learning_rate": 0.0002, "epoch": 2.9062383264848712, "step": 3890}, {"loss": 1.6228, "grad_norm": 0.48838064074516296, "learning_rate": 0.0002, "epoch": 2.9137093761673514, "step": 3900}, {"loss": 1.658, "grad_norm": 0.43496349453926086, "learning_rate": 0.0002, "epoch": 2.921180425849832, "step": 3910}, {"loss": 1.7063, "grad_norm": 0.47963935136795044, "learning_rate": 0.0002, "epoch": 2.9286514755323125, "step": 3920}, {"loss": 1.6553, "grad_norm": 0.4544987976551056, "learning_rate": 0.0002, "epoch": 2.9361225252147927, "step": 3930}, {"loss": 1.6192, "grad_norm": 0.4622892141342163, "learning_rate": 0.0002, "epoch": 2.943593574897273, "step": 3940}, {"loss": 1.6178, "grad_norm": 0.47026222944259644, "learning_rate": 0.0002, "epoch": 2.9510646245797534, "step": 3950}, {"loss": 1.6612, "grad_norm": 0.4549552798271179, "learning_rate": 0.0002, "epoch": 2.958535674262234, "step": 3960}, {"loss": 1.6458, "grad_norm": 0.46647515892982483, "learning_rate": 0.0002, "epoch": 2.966006723944714, "step": 3970}, {"loss": 1.6051, "grad_norm": 0.45095112919807434, "learning_rate": 0.0002, "epoch": 2.9734777736271947, "step": 3980}, {"loss": 1.6471, "grad_norm": 0.4690017104148865, "learning_rate": 0.0002, "epoch": 2.9809488233096753, "step": 3990}, {"loss": 1.6061, "grad_norm": 0.4603444039821625, "learning_rate": 0.0002, "epoch": 2.9884198729921554, "step": 4000}, {"loss": 1.6431, "grad_norm": 0.4743294417858124, "learning_rate": 0.0002, "epoch": 2.9958909226746355, "step": 4010}, {"eval_loss": 1.8252571821212769, "eval_runtime": 38.7853, "eval_samples_per_second": 13.278, "eval_steps_per_second": 1.676, "epoch": 2.999626447515876, "step": 4015}, {"loss": 1.6512, "grad_norm": 0.4919724464416504, "learning_rate": 0.0002, "epoch": 3.003361972357116, "step": 4020}, {"loss": 1.5354, "grad_norm": 0.4747185707092285, "learning_rate": 0.0002, "epoch": 3.0108330220395967, "step": 4030}, {"loss": 1.568, "grad_norm": 0.4797595143318176, "learning_rate": 0.0002, "epoch": 3.018304071722077, "step": 4040}, {"loss": 1.5194, "grad_norm": 0.5450999140739441, "learning_rate": 0.0002, "epoch": 3.0257751214045574, "step": 4050}, {"loss": 1.5065, "grad_norm": 0.49058812856674194, "learning_rate": 0.0002, "epoch": 3.0332461710870375, "step": 4060}, {"loss": 1.4884, "grad_norm": 0.5219563841819763, "learning_rate": 0.0002, "epoch": 3.040717220769518, "step": 4070}, {"loss": 1.4742, "grad_norm": 0.515628457069397, "learning_rate": 0.0002, "epoch": 3.0481882704519987, "step": 4080}, {"loss": 1.5313, "grad_norm": 0.6145984530448914, "learning_rate": 0.0002, "epoch": 3.055659320134479, "step": 4090}, {"loss": 1.4989, "grad_norm": 0.6067144274711609, "learning_rate": 0.0002, "epoch": 3.0631303698169594, "step": 4100}, {"loss": 1.528, "grad_norm": 0.5773133039474487, "learning_rate": 0.0002, "epoch": 3.0706014194994395, "step": 4110}, {"loss": 1.5374, "grad_norm": 0.6894241571426392, "learning_rate": 0.0002, "epoch": 3.07807246918192, "step": 4120}, {"loss": 1.5422, "grad_norm": 0.6422514915466309, "learning_rate": 0.0002, "epoch": 3.0855435188644003, "step": 4130}, {"loss": 1.4724, "grad_norm": 0.6119855046272278, "learning_rate": 0.0002, "epoch": 3.093014568546881, "step": 4140}, {"loss": 1.5361, "grad_norm": 0.5847280025482178, "learning_rate": 0.0002, "epoch": 3.1004856182293614, "step": 4150}, {"loss": 1.5151, "grad_norm": 0.5401515960693359, "learning_rate": 0.0002, "epoch": 3.1079566679118416, "step": 4160}, {"loss": 1.502, "grad_norm": 0.6501587629318237, "learning_rate": 0.0002, "epoch": 3.115427717594322, "step": 4170}, {"loss": 1.4952, "grad_norm": 0.5988039374351501, "learning_rate": 0.0002, "epoch": 3.1228987672768023, "step": 4180}, {"loss": 1.5287, "grad_norm": 0.4982665181159973, "learning_rate": 0.0002, "epoch": 3.130369816959283, "step": 4190}, {"loss": 1.5078, "grad_norm": 0.5548039078712463, "learning_rate": 0.0002, "epoch": 3.137840866641763, "step": 4200}, {"loss": 1.4904, "grad_norm": 0.5920777320861816, "learning_rate": 0.0002, "epoch": 3.1453119163242436, "step": 4210}, {"loss": 1.442, "grad_norm": 0.6965190172195435, "learning_rate": 0.0002, "epoch": 3.152782966006724, "step": 4220}, {"loss": 1.557, "grad_norm": 0.5196244716644287, "learning_rate": 0.0002, "epoch": 3.1602540156892043, "step": 4230}, {"loss": 1.5706, "grad_norm": 0.6942682266235352, "learning_rate": 0.0002, "epoch": 3.167725065371685, "step": 4240}, {"loss": 1.5407, "grad_norm": 0.5765156149864197, "learning_rate": 0.0002, "epoch": 3.175196115054165, "step": 4250}, {"loss": 1.4963, "grad_norm": 0.5801976919174194, "learning_rate": 0.0002, "epoch": 3.1826671647366456, "step": 4260}, {"loss": 1.4988, "grad_norm": 0.6260752081871033, "learning_rate": 0.0002, "epoch": 3.1901382144191257, "step": 4270}, {"loss": 1.5074, "grad_norm": 0.6610770225524902, "learning_rate": 0.0002, "epoch": 3.1976092641016063, "step": 4280}, {"loss": 1.4657, "grad_norm": 0.5762143135070801, "learning_rate": 0.0002, "epoch": 3.205080313784087, "step": 4290}, {"loss": 1.5181, "grad_norm": 0.5926990509033203, "learning_rate": 0.0002, "epoch": 3.212551363466567, "step": 4300}, {"loss": 1.5492, "grad_norm": 0.7373854517936707, "learning_rate": 0.0002, "epoch": 3.2200224131490476, "step": 4310}, {"loss": 1.4648, "grad_norm": 0.5963311195373535, "learning_rate": 0.0002, "epoch": 3.2274934628315277, "step": 4320}, {"loss": 1.5262, "grad_norm": 0.5754616856575012, "learning_rate": 0.0002, "epoch": 3.2349645125140083, "step": 4330}, {"loss": 1.4767, "grad_norm": 0.6116095781326294, "learning_rate": 0.0002, "epoch": 3.2424355621964884, "step": 4340}, {"loss": 1.5008, "grad_norm": 0.6001536846160889, "learning_rate": 0.0002, "epoch": 3.249906611878969, "step": 4350}, {"loss": 1.5738, "grad_norm": 0.5270227789878845, "learning_rate": 0.0002, "epoch": 3.257377661561449, "step": 4360}, {"loss": 1.5235, "grad_norm": 0.6666602492332458, "learning_rate": 0.0002, "epoch": 3.2648487112439297, "step": 4370}, {"loss": 1.5665, "grad_norm": 0.520310640335083, "learning_rate": 0.0002, "epoch": 3.2723197609264103, "step": 4380}, {"loss": 1.542, "grad_norm": 0.5165975093841553, "learning_rate": 0.0002, "epoch": 3.2797908106088904, "step": 4390}, {"loss": 1.4746, "grad_norm": 0.6080228686332703, "learning_rate": 0.0002, "epoch": 3.287261860291371, "step": 4400}, {"loss": 1.4901, "grad_norm": 0.670122504234314, "learning_rate": 0.0002, "epoch": 3.294732909973851, "step": 4410}, {"loss": 1.4677, "grad_norm": 0.6019457578659058, "learning_rate": 0.0002, "epoch": 3.3022039596563317, "step": 4420}, {"loss": 1.4249, "grad_norm": 0.5519300103187561, "learning_rate": 0.0002, "epoch": 3.309675009338812, "step": 4430}, {"loss": 1.555, "grad_norm": 0.5958521962165833, "learning_rate": 0.0002, "epoch": 3.3171460590212924, "step": 4440}, {"loss": 1.5067, "grad_norm": 0.5552705526351929, "learning_rate": 0.0002, "epoch": 3.324617108703773, "step": 4450}, {"loss": 1.5926, "grad_norm": 0.6583784818649292, "learning_rate": 0.0002, "epoch": 3.332088158386253, "step": 4460}, {"loss": 1.4206, "grad_norm": 0.5815939903259277, "learning_rate": 0.0002, "epoch": 3.3395592080687337, "step": 4470}, {"loss": 1.5942, "grad_norm": 1.3342205286026, "learning_rate": 0.0002, "epoch": 3.347030257751214, "step": 4480}, {"loss": 1.484, "grad_norm": 0.6341500878334045, "learning_rate": 0.0002, "epoch": 3.3545013074336945, "step": 4490}, {"loss": 1.5219, "grad_norm": 0.6384079456329346, "learning_rate": 0.0002, "epoch": 3.3619723571161746, "step": 4500}, {"loss": 1.5222, "grad_norm": 0.6098346710205078, "learning_rate": 0.0002, "epoch": 3.369443406798655, "step": 4510}, {"loss": 1.5475, "grad_norm": 0.5958296656608582, "learning_rate": 0.0002, "epoch": 3.3769144564811358, "step": 4520}, {"loss": 1.5171, "grad_norm": 0.6157881617546082, "learning_rate": 0.0002, "epoch": 3.384385506163616, "step": 4530}, {"loss": 1.569, "grad_norm": 0.5671007037162781, "learning_rate": 0.0002, "epoch": 3.3918565558460965, "step": 4540}, {"loss": 1.604, "grad_norm": 0.6203294992446899, "learning_rate": 0.0002, "epoch": 3.3993276055285766, "step": 4550}, {"loss": 1.5364, "grad_norm": 0.6743317246437073, "learning_rate": 0.0002, "epoch": 3.406798655211057, "step": 4560}, {"loss": 1.5034, "grad_norm": 0.731765627861023, "learning_rate": 0.0002, "epoch": 3.4142697048935373, "step": 4570}, {"loss": 1.4585, "grad_norm": 0.6285187602043152, "learning_rate": 0.0002, "epoch": 3.421740754576018, "step": 4580}, {"loss": 1.5296, "grad_norm": 0.612680196762085, "learning_rate": 0.0002, "epoch": 3.4292118042584985, "step": 4590}, {"loss": 1.5577, "grad_norm": 0.6413681507110596, "learning_rate": 0.0002, "epoch": 3.4366828539409786, "step": 4600}, {"loss": 1.5026, "grad_norm": 0.6240990161895752, "learning_rate": 0.0002, "epoch": 3.444153903623459, "step": 4610}, {"loss": 1.5887, "grad_norm": 0.5095735192298889, "learning_rate": 0.0002, "epoch": 3.4516249533059393, "step": 4620}, {"loss": 1.4906, "grad_norm": 0.5699611902236938, "learning_rate": 0.0002, "epoch": 3.45909600298842, "step": 4630}, {"loss": 1.5176, "grad_norm": 0.7289775609970093, "learning_rate": 0.0002, "epoch": 3.4665670526709, "step": 4640}, {"loss": 1.5467, "grad_norm": 0.6211609840393066, "learning_rate": 0.0002, "epoch": 3.4740381023533806, "step": 4650}, {"loss": 1.533, "grad_norm": 0.5714802145957947, "learning_rate": 0.0002, "epoch": 3.481509152035861, "step": 4660}, {"loss": 1.5096, "grad_norm": 0.6287049651145935, "learning_rate": 0.0002, "epoch": 3.4889802017183413, "step": 4670}, {"loss": 1.4212, "grad_norm": 0.5480595827102661, "learning_rate": 0.0002, "epoch": 3.496451251400822, "step": 4680}, {"loss": 1.4746, "grad_norm": 0.5683253407478333, "learning_rate": 0.0002, "epoch": 3.503922301083302, "step": 4690}, {"loss": 1.5012, "grad_norm": 0.601140558719635, "learning_rate": 0.0002, "epoch": 3.5113933507657826, "step": 4700}, {"loss": 1.5383, "grad_norm": 0.5344498157501221, "learning_rate": 0.0002, "epoch": 3.5188644004482628, "step": 4710}, {"loss": 1.5428, "grad_norm": 0.5739690661430359, "learning_rate": 0.0002, "epoch": 3.5263354501307433, "step": 4720}, {"loss": 1.5589, "grad_norm": 0.5640085935592651, "learning_rate": 0.0002, "epoch": 3.533806499813224, "step": 4730}, {"loss": 1.487, "grad_norm": 0.5967805981636047, "learning_rate": 0.0002, "epoch": 3.541277549495704, "step": 4740}, {"loss": 1.5461, "grad_norm": 0.6138835549354553, "learning_rate": 0.0002, "epoch": 3.5487485991781846, "step": 4750}, {"loss": 1.5502, "grad_norm": 0.6779900193214417, "learning_rate": 0.0002, "epoch": 3.5562196488606648, "step": 4760}, {"loss": 1.4917, "grad_norm": 0.6122010350227356, "learning_rate": 0.0002, "epoch": 3.5636906985431454, "step": 4770}, {"loss": 1.5405, "grad_norm": 0.5685241222381592, "learning_rate": 0.0002, "epoch": 3.5711617482256255, "step": 4780}, {"loss": 1.5427, "grad_norm": 0.604583203792572, "learning_rate": 0.0002, "epoch": 3.578632797908106, "step": 4790}, {"loss": 1.4514, "grad_norm": 0.651165246963501, "learning_rate": 0.0002, "epoch": 3.5861038475905866, "step": 4800}, {"loss": 1.4109, "grad_norm": 0.6398511528968811, "learning_rate": 0.0002, "epoch": 3.593574897273067, "step": 4810}, {"loss": 1.4261, "grad_norm": 0.6444641351699829, "learning_rate": 0.0002, "epoch": 3.6010459469555474, "step": 4820}, {"loss": 1.5274, "grad_norm": 0.6018481850624084, "learning_rate": 0.0002, "epoch": 3.6085169966380275, "step": 4830}, {"loss": 1.4647, "grad_norm": 0.6025291085243225, "learning_rate": 0.0002, "epoch": 3.615988046320508, "step": 4840}, {"loss": 1.5609, "grad_norm": 0.6810156106948853, "learning_rate": 0.0002, "epoch": 3.623459096002988, "step": 4850}, {"loss": 1.5299, "grad_norm": 0.6408044695854187, "learning_rate": 0.0002, "epoch": 3.630930145685469, "step": 4860}, {"loss": 1.5366, "grad_norm": 0.5608272552490234, "learning_rate": 0.0002, "epoch": 3.6384011953679494, "step": 4870}, {"loss": 1.5188, "grad_norm": 0.6136814951896667, "learning_rate": 0.0002, "epoch": 3.6458722450504295, "step": 4880}, {"loss": 1.5021, "grad_norm": 0.5927900075912476, "learning_rate": 0.0002, "epoch": 3.65334329473291, "step": 4890}, {"loss": 1.6084, "grad_norm": 0.5336901545524597, "learning_rate": 0.0002, "epoch": 3.66081434441539, "step": 4900}, {"loss": 1.5701, "grad_norm": 0.7823320627212524, "learning_rate": 0.0002, "epoch": 3.668285394097871, "step": 4910}, {"loss": 1.4881, "grad_norm": 0.6703504323959351, "learning_rate": 0.0002, "epoch": 3.675756443780351, "step": 4920}, {"loss": 1.5332, "grad_norm": 0.6061160564422607, "learning_rate": 0.0002, "epoch": 3.6832274934628315, "step": 4930}, {"loss": 1.5405, "grad_norm": 0.6237227916717529, "learning_rate": 0.0002, "epoch": 3.690698543145312, "step": 4940}, {"loss": 1.497, "grad_norm": 0.5985278487205505, "learning_rate": 0.0002, "epoch": 3.6981695928277922, "step": 4950}, {"loss": 1.5132, "grad_norm": 0.6483839750289917, "learning_rate": 0.0002, "epoch": 3.705640642510273, "step": 4960}, {"loss": 1.5338, "grad_norm": 0.5788805484771729, "learning_rate": 0.0002, "epoch": 3.713111692192753, "step": 4970}, {"loss": 1.5258, "grad_norm": 0.5609974265098572, "learning_rate": 0.0002, "epoch": 3.7205827418752335, "step": 4980}, {"loss": 1.4759, "grad_norm": 0.5681300759315491, "learning_rate": 0.0002, "epoch": 3.7280537915577137, "step": 4990}, {"loss": 1.6018, "grad_norm": 0.5860186219215393, "learning_rate": 0.0002, "epoch": 3.7355248412401942, "step": 5000}, {"loss": 1.58, "grad_norm": 0.5718157291412354, "learning_rate": 0.0002, "epoch": 3.742995890922675, "step": 5010}, {"loss": 1.5834, "grad_norm": 0.6173721551895142, "learning_rate": 0.0002, "epoch": 3.750466940605155, "step": 5020}, {"loss": 1.5617, "grad_norm": 0.629152238368988, "learning_rate": 0.0002, "epoch": 3.7579379902876355, "step": 5030}, {"loss": 1.519, "grad_norm": 0.5666284561157227, "learning_rate": 0.0002, "epoch": 3.7654090399701157, "step": 5040}, {"loss": 1.5329, "grad_norm": 0.6053005456924438, "learning_rate": 0.0002, "epoch": 3.7728800896525962, "step": 5050}, {"loss": 1.5404, "grad_norm": 0.5870583057403564, "learning_rate": 0.0002, "epoch": 3.7803511393350764, "step": 5060}, {"loss": 1.4444, "grad_norm": 0.5422009229660034, "learning_rate": 0.0002, "epoch": 3.787822189017557, "step": 5070}, {"loss": 1.5308, "grad_norm": 0.5396918058395386, "learning_rate": 0.0002, "epoch": 3.7952932387000375, "step": 5080}, {"loss": 1.464, "grad_norm": 0.5544713139533997, "learning_rate": 0.0002, "epoch": 3.8027642883825177, "step": 5090}, {"loss": 1.4752, "grad_norm": 0.5983749628067017, "learning_rate": 0.0002, "epoch": 3.8102353380649983, "step": 5100}, {"loss": 1.4972, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 3.8177063877474784, "step": 5110}, {"loss": 1.5471, "grad_norm": 0.5436882376670837, "learning_rate": 0.0002, "epoch": 3.825177437429959, "step": 5120}, {"loss": 1.5118, "grad_norm": 0.5453617572784424, "learning_rate": 0.0002, "epoch": 3.832648487112439, "step": 5130}, {"loss": 1.5732, "grad_norm": 0.6269069314002991, "learning_rate": 0.0002, "epoch": 3.8401195367949197, "step": 5140}, {"loss": 1.4959, "grad_norm": 0.6189185380935669, "learning_rate": 0.0002, "epoch": 3.8475905864774003, "step": 5150}, {"loss": 1.4999, "grad_norm": 0.6653388142585754, "learning_rate": 0.0002, "epoch": 3.8550616361598804, "step": 5160}, {"loss": 1.5075, "grad_norm": 0.5771768689155579, "learning_rate": 0.0002, "epoch": 3.862532685842361, "step": 5170}, {"loss": 1.5545, "grad_norm": 0.6052790880203247, "learning_rate": 0.0002, "epoch": 3.870003735524841, "step": 5180}, {"loss": 1.4987, "grad_norm": 0.6572316884994507, "learning_rate": 0.0002, "epoch": 3.8774747852073217, "step": 5190}, {"loss": 1.5241, "grad_norm": 0.670576810836792, "learning_rate": 0.0002, "epoch": 3.884945834889802, "step": 5200}, {"loss": 1.4777, "grad_norm": 0.5728798508644104, "learning_rate": 0.0002, "epoch": 3.8924168845722824, "step": 5210}, {"loss": 1.5351, "grad_norm": 0.6340774297714233, "learning_rate": 0.0002, "epoch": 3.899887934254763, "step": 5220}, {"loss": 1.5081, "grad_norm": 0.5981315970420837, "learning_rate": 0.0002, "epoch": 3.907358983937243, "step": 5230}, {"loss": 1.4875, "grad_norm": 0.6212025880813599, "learning_rate": 0.0002, "epoch": 3.9148300336197237, "step": 5240}, {"loss": 1.5545, "grad_norm": 0.6202296018600464, "learning_rate": 0.0002, "epoch": 3.922301083302204, "step": 5250}, {"loss": 1.5765, "grad_norm": 0.6159142255783081, "learning_rate": 0.0002, "epoch": 3.9297721329846844, "step": 5260}, {"loss": 1.4938, "grad_norm": 0.6519438624382019, "learning_rate": 0.0002, "epoch": 3.9372431826671646, "step": 5270}, {"loss": 1.4859, "grad_norm": 0.539813756942749, "learning_rate": 0.0002, "epoch": 3.944714232349645, "step": 5280}, {"loss": 1.5921, "grad_norm": 0.6443665027618408, "learning_rate": 0.0002, "epoch": 3.9521852820321257, "step": 5290}, {"loss": 1.5153, "grad_norm": 0.6635757684707642, "learning_rate": 0.0002, "epoch": 3.959656331714606, "step": 5300}, {"loss": 1.5485, "grad_norm": 0.589363157749176, "learning_rate": 0.0002, "epoch": 3.9671273813970864, "step": 5310}, {"loss": 1.5498, "grad_norm": 0.5788735747337341, "learning_rate": 0.0002, "epoch": 3.9745984310795666, "step": 5320}, {"loss": 1.5607, "grad_norm": 0.5976864695549011, "learning_rate": 0.0002, "epoch": 3.982069480762047, "step": 5330}, {"loss": 1.5302, "grad_norm": 0.6624067425727844, "learning_rate": 0.0002, "epoch": 3.9895405304445273, "step": 5340}, {"loss": 1.5904, "grad_norm": 0.6738956570625305, "learning_rate": 0.0002, "epoch": 3.997011580127008, "step": 5350}]} +{"epoch": 4.999626447515876, "step": 6692, "epoch_duration": 1697.2771589756012, "total_accumulated_duration": 7520.807319164276, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}, {"eval_loss": 1.8046749830245972, "eval_runtime": 38.5096, "eval_samples_per_second": 13.373, "eval_steps_per_second": 1.688, "epoch": 2.0, "step": 2677}, {"loss": 1.7265, "grad_norm": 0.36817631125450134, "learning_rate": 0.0002, "epoch": 2.002241314904744, "step": 2680}, {"loss": 1.548, "grad_norm": 0.4056456685066223, "learning_rate": 0.0002, "epoch": 2.0097123645872244, "step": 2690}, {"loss": 1.5515, "grad_norm": 0.37416863441467285, "learning_rate": 0.0002, "epoch": 2.017183414269705, "step": 2700}, {"loss": 1.5895, "grad_norm": 0.4273638427257538, "learning_rate": 0.0002, "epoch": 2.024654463952185, "step": 2710}, {"loss": 1.5884, "grad_norm": 0.36497923731803894, "learning_rate": 0.0002, "epoch": 2.0321255136346656, "step": 2720}, {"loss": 1.6999, "grad_norm": 0.5021994113922119, "learning_rate": 0.0002, "epoch": 2.0395965633171462, "step": 2730}, {"loss": 1.6655, "grad_norm": 0.45896220207214355, "learning_rate": 0.0002, "epoch": 2.0470676129996264, "step": 2740}, {"loss": 1.6305, "grad_norm": 0.3973815143108368, "learning_rate": 0.0002, "epoch": 2.054538662682107, "step": 2750}, {"loss": 1.6301, "grad_norm": 0.4521815776824951, "learning_rate": 0.0002, "epoch": 2.062009712364587, "step": 2760}, {"loss": 1.6189, "grad_norm": 0.42775002121925354, "learning_rate": 0.0002, "epoch": 2.0694807620470677, "step": 2770}, {"loss": 1.6491, "grad_norm": 0.48158586025238037, "learning_rate": 0.0002, "epoch": 2.076951811729548, "step": 2780}, {"loss": 1.6301, "grad_norm": 0.4612371623516083, "learning_rate": 0.0002, "epoch": 2.0844228614120284, "step": 2790}, {"loss": 1.6327, "grad_norm": 0.42536866664886475, "learning_rate": 0.0002, "epoch": 2.091893911094509, "step": 2800}, {"loss": 1.651, "grad_norm": 0.48515772819519043, "learning_rate": 0.0002, "epoch": 2.099364960776989, "step": 2810}, {"loss": 1.6829, "grad_norm": 0.41418662667274475, "learning_rate": 0.0002, "epoch": 2.1068360104594697, "step": 2820}, {"loss": 1.6266, "grad_norm": 0.4683697819709778, "learning_rate": 0.0002, "epoch": 2.11430706014195, "step": 2830}, {"loss": 1.6586, "grad_norm": 0.4484657049179077, "learning_rate": 0.0002, "epoch": 2.1217781098244304, "step": 2840}, {"loss": 1.6483, "grad_norm": 0.6621400713920593, "learning_rate": 0.0002, "epoch": 2.1292491595069105, "step": 2850}, {"loss": 1.5755, "grad_norm": 0.45074811577796936, "learning_rate": 0.0002, "epoch": 2.136720209189391, "step": 2860}, {"loss": 1.6456, "grad_norm": 0.3513113558292389, "learning_rate": 0.0002, "epoch": 2.1441912588718717, "step": 2870}, {"loss": 1.6081, "grad_norm": 0.40411314368247986, "learning_rate": 0.0002, "epoch": 2.151662308554352, "step": 2880}, {"loss": 1.6323, "grad_norm": 0.4121065139770508, "learning_rate": 0.0002, "epoch": 2.1591333582368324, "step": 2890}, {"loss": 1.6324, "grad_norm": 0.44723689556121826, "learning_rate": 0.0002, "epoch": 2.1666044079193125, "step": 2900}, {"loss": 1.5699, "grad_norm": 0.4226122498512268, "learning_rate": 0.0002, "epoch": 2.174075457601793, "step": 2910}, {"loss": 1.5652, "grad_norm": 0.46617650985717773, "learning_rate": 0.0002, "epoch": 2.1815465072842732, "step": 2920}, {"loss": 1.6378, "grad_norm": 0.4506422281265259, "learning_rate": 0.0002, "epoch": 2.189017556966754, "step": 2930}, {"loss": 1.6112, "grad_norm": 0.4892672896385193, "learning_rate": 0.0002, "epoch": 2.1964886066492344, "step": 2940}, {"loss": 1.6176, "grad_norm": 0.44095516204833984, "learning_rate": 0.0002, "epoch": 2.2039596563317145, "step": 2950}, {"loss": 1.6058, "grad_norm": 0.41522109508514404, "learning_rate": 0.0002, "epoch": 2.211430706014195, "step": 2960}, {"loss": 1.5964, "grad_norm": 0.4860858917236328, "learning_rate": 0.0002, "epoch": 2.2189017556966752, "step": 2970}, {"loss": 1.6427, "grad_norm": 0.42662516236305237, "learning_rate": 0.0002, "epoch": 2.226372805379156, "step": 2980}, {"loss": 1.6313, "grad_norm": 0.4390648305416107, "learning_rate": 0.0002, "epoch": 2.233843855061636, "step": 2990}, {"loss": 1.5992, "grad_norm": 0.47515565156936646, "learning_rate": 0.0002, "epoch": 2.2413149047441165, "step": 3000}, {"loss": 1.5563, "grad_norm": 0.4104543924331665, "learning_rate": 0.0002, "epoch": 2.248785954426597, "step": 3010}, {"loss": 1.6895, "grad_norm": 0.4404028654098511, "learning_rate": 0.0002, "epoch": 2.2562570041090773, "step": 3020}, {"loss": 1.6088, "grad_norm": 0.4717366695404053, "learning_rate": 0.0002, "epoch": 2.263728053791558, "step": 3030}, {"loss": 1.7287, "grad_norm": 0.48345857858657837, "learning_rate": 0.0002, "epoch": 2.271199103474038, "step": 3040}, {"loss": 1.681, "grad_norm": 0.5312452912330627, "learning_rate": 0.0002, "epoch": 2.2786701531565186, "step": 3050}, {"loss": 1.5901, "grad_norm": 0.5073099732398987, "learning_rate": 0.0002, "epoch": 2.2861412028389987, "step": 3060}, {"loss": 1.6914, "grad_norm": 0.5027463436126709, "learning_rate": 0.0002, "epoch": 2.2936122525214793, "step": 3070}, {"loss": 1.5862, "grad_norm": 0.5436304807662964, "learning_rate": 0.0002, "epoch": 2.30108330220396, "step": 3080}, {"loss": 1.5763, "grad_norm": 0.4701065123081207, "learning_rate": 0.0002, "epoch": 2.30855435188644, "step": 3090}, {"loss": 1.6177, "grad_norm": 0.46988746523857117, "learning_rate": 0.0002, "epoch": 2.3160254015689206, "step": 3100}, {"loss": 1.6502, "grad_norm": 0.45112869143486023, "learning_rate": 0.0002, "epoch": 2.3234964512514007, "step": 3110}, {"loss": 1.6291, "grad_norm": 0.5173566937446594, "learning_rate": 0.0002, "epoch": 2.3309675009338813, "step": 3120}, {"loss": 1.6743, "grad_norm": 0.40345850586891174, "learning_rate": 0.0002, "epoch": 2.3384385506163614, "step": 3130}, {"loss": 1.621, "grad_norm": 0.4218924939632416, "learning_rate": 0.0002, "epoch": 2.345909600298842, "step": 3140}, {"loss": 1.6341, "grad_norm": 0.41857317090034485, "learning_rate": 0.0002, "epoch": 2.3533806499813226, "step": 3150}, {"loss": 1.6087, "grad_norm": 0.4197218418121338, "learning_rate": 0.0002, "epoch": 2.3608516996638027, "step": 3160}, {"loss": 1.6572, "grad_norm": 0.4260677397251129, "learning_rate": 0.0002, "epoch": 2.3683227493462833, "step": 3170}, {"loss": 1.6376, "grad_norm": 0.4209042191505432, "learning_rate": 0.0002, "epoch": 2.3757937990287634, "step": 3180}, {"loss": 1.634, "grad_norm": 0.4092234969139099, "learning_rate": 0.0002, "epoch": 2.383264848711244, "step": 3190}, {"loss": 1.6339, "grad_norm": 0.4928431510925293, "learning_rate": 0.0002, "epoch": 2.390735898393724, "step": 3200}, {"loss": 1.6015, "grad_norm": 0.49252402782440186, "learning_rate": 0.0002, "epoch": 2.3982069480762047, "step": 3210}, {"loss": 1.5773, "grad_norm": 0.4368397295475006, "learning_rate": 0.0002, "epoch": 2.4056779977586853, "step": 3220}, {"loss": 1.6629, "grad_norm": 0.46122390031814575, "learning_rate": 0.0002, "epoch": 2.4131490474411654, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4272301197052002, "learning_rate": 0.0002, "epoch": 2.420620097123646, "step": 3240}, {"loss": 1.5961, "grad_norm": 0.41480937600135803, "learning_rate": 0.0002, "epoch": 2.428091146806126, "step": 3250}, {"loss": 1.6281, "grad_norm": 0.48911941051483154, "learning_rate": 0.0002, "epoch": 2.4355621964886067, "step": 3260}, {"loss": 1.6846, "grad_norm": 0.4444098472595215, "learning_rate": 0.0002, "epoch": 2.443033246171087, "step": 3270}, {"loss": 1.6961, "grad_norm": 0.5111684799194336, "learning_rate": 0.0002, "epoch": 2.4505042958535674, "step": 3280}, {"loss": 1.6152, "grad_norm": 0.5058825016021729, "learning_rate": 0.0002, "epoch": 2.457975345536048, "step": 3290}, {"loss": 1.625, "grad_norm": 0.44173210859298706, "learning_rate": 0.0002, "epoch": 2.465446395218528, "step": 3300}, {"loss": 1.6491, "grad_norm": 0.4659745991230011, "learning_rate": 0.0002, "epoch": 2.4729174449010087, "step": 3310}, {"loss": 1.6114, "grad_norm": 0.47237497568130493, "learning_rate": 0.0002, "epoch": 2.480388494583489, "step": 3320}, {"loss": 1.6193, "grad_norm": 0.47303131222724915, "learning_rate": 0.0002, "epoch": 2.4878595442659694, "step": 3330}, {"loss": 1.7256, "grad_norm": 0.4522389769554138, "learning_rate": 0.0002, "epoch": 2.4953305939484496, "step": 3340}, {"loss": 1.6834, "grad_norm": 0.4467332363128662, "learning_rate": 0.0002, "epoch": 2.50280164363093, "step": 3350}, {"loss": 1.6108, "grad_norm": 0.4413762092590332, "learning_rate": 0.0002, "epoch": 2.5102726933134107, "step": 3360}, {"loss": 1.537, "grad_norm": 0.495514452457428, "learning_rate": 0.0002, "epoch": 2.517743742995891, "step": 3370}, {"loss": 1.5839, "grad_norm": 0.4429773986339569, "learning_rate": 0.0002, "epoch": 2.5252147926783715, "step": 3380}, {"loss": 1.6522, "grad_norm": 0.4589079022407532, "learning_rate": 0.0002, "epoch": 2.5326858423608516, "step": 3390}, {"loss": 1.6529, "grad_norm": 0.4683997333049774, "learning_rate": 0.0002, "epoch": 2.540156892043332, "step": 3400}, {"loss": 1.6745, "grad_norm": 0.4651731252670288, "learning_rate": 0.0002, "epoch": 2.5476279417258123, "step": 3410}, {"loss": 1.5918, "grad_norm": 0.45818084478378296, "learning_rate": 0.0002, "epoch": 2.555098991408293, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.45209529995918274, "learning_rate": 0.0002, "epoch": 2.5625700410907735, "step": 3430}, {"loss": 1.5606, "grad_norm": 0.4344733655452728, "learning_rate": 0.0002, "epoch": 2.5700410907732536, "step": 3440}, {"loss": 1.6748, "grad_norm": 0.47435566782951355, "learning_rate": 0.0002, "epoch": 2.577512140455734, "step": 3450}, {"loss": 1.6237, "grad_norm": 0.43841999769210815, "learning_rate": 0.0002, "epoch": 2.5849831901382143, "step": 3460}, {"loss": 1.7207, "grad_norm": 0.4323869049549103, "learning_rate": 0.0002, "epoch": 2.592454239820695, "step": 3470}, {"loss": 1.5494, "grad_norm": 0.44355881214141846, "learning_rate": 0.0002, "epoch": 2.599925289503175, "step": 3480}, {"loss": 1.665, "grad_norm": 0.45847779512405396, "learning_rate": 0.0002, "epoch": 2.6073963391856556, "step": 3490}, {"loss": 1.6006, "grad_norm": 0.4411061704158783, "learning_rate": 0.0002, "epoch": 2.614867388868136, "step": 3500}, {"loss": 1.5868, "grad_norm": 0.4446796178817749, "learning_rate": 0.0002, "epoch": 2.6223384385506163, "step": 3510}, {"loss": 1.5946, "grad_norm": 0.41969653964042664, "learning_rate": 0.0002, "epoch": 2.629809488233097, "step": 3520}, {"loss": 1.6798, "grad_norm": 0.5263747572898865, "learning_rate": 0.0002, "epoch": 2.637280537915577, "step": 3530}, {"loss": 1.6309, "grad_norm": 0.47719451785087585, "learning_rate": 0.0002, "epoch": 2.6447515875980576, "step": 3540}, {"loss": 1.7024, "grad_norm": 0.46574118733406067, "learning_rate": 0.0002, "epoch": 2.6522226372805378, "step": 3550}, {"loss": 1.618, "grad_norm": 0.46867135167121887, "learning_rate": 0.0002, "epoch": 2.6596936869630183, "step": 3560}, {"loss": 1.5885, "grad_norm": 0.4441198706626892, "learning_rate": 0.0002, "epoch": 2.667164736645499, "step": 3570}, {"loss": 1.6426, "grad_norm": 0.4871319830417633, "learning_rate": 0.0002, "epoch": 2.674635786327979, "step": 3580}, {"loss": 1.6575, "grad_norm": 0.43900373578071594, "learning_rate": 0.0002, "epoch": 2.6821068360104596, "step": 3590}, {"loss": 1.6071, "grad_norm": 0.42509549856185913, "learning_rate": 0.0002, "epoch": 2.6895778856929398, "step": 3600}, {"loss": 1.5651, "grad_norm": 0.4691086709499359, "learning_rate": 0.0002, "epoch": 2.6970489353754203, "step": 3610}, {"loss": 1.5491, "grad_norm": 0.46318942308425903, "learning_rate": 0.0002, "epoch": 2.7045199850579005, "step": 3620}, {"loss": 1.5422, "grad_norm": 0.44631096720695496, "learning_rate": 0.0002, "epoch": 2.711991034740381, "step": 3630}, {"loss": 1.6831, "grad_norm": 0.42315489053726196, "learning_rate": 0.0002, "epoch": 2.7194620844228616, "step": 3640}, {"loss": 1.6008, "grad_norm": 0.4971241056919098, "learning_rate": 0.0002, "epoch": 2.7269331341053418, "step": 3650}, {"loss": 1.6042, "grad_norm": 0.4578486382961273, "learning_rate": 0.0002, "epoch": 2.7344041837878224, "step": 3660}, {"loss": 1.6076, "grad_norm": 0.46584776043891907, "learning_rate": 0.0002, "epoch": 2.7418752334703025, "step": 3670}, {"loss": 1.6809, "grad_norm": 0.4951731264591217, "learning_rate": 0.0002, "epoch": 2.749346283152783, "step": 3680}, {"loss": 1.6226, "grad_norm": 0.4935225546360016, "learning_rate": 0.0002, "epoch": 2.756817332835263, "step": 3690}, {"loss": 1.5878, "grad_norm": 0.41805586218833923, "learning_rate": 0.0002, "epoch": 2.764288382517744, "step": 3700}, {"loss": 1.7173, "grad_norm": 0.4417555630207062, "learning_rate": 0.0002, "epoch": 2.7717594322002244, "step": 3710}, {"loss": 1.6398, "grad_norm": 0.48229655623435974, "learning_rate": 0.0002, "epoch": 2.7792304818827045, "step": 3720}, {"loss": 1.6074, "grad_norm": 0.48562315106391907, "learning_rate": 0.0002, "epoch": 2.786701531565185, "step": 3730}, {"loss": 1.607, "grad_norm": 0.4473940432071686, "learning_rate": 0.0002, "epoch": 2.794172581247665, "step": 3740}, {"loss": 1.6065, "grad_norm": 0.4626813232898712, "learning_rate": 0.0002, "epoch": 2.801643630930146, "step": 3750}, {"loss": 1.6296, "grad_norm": 0.4339792728424072, "learning_rate": 0.0002, "epoch": 2.809114680612626, "step": 3760}, {"loss": 1.6815, "grad_norm": 0.5250858068466187, "learning_rate": 0.0002, "epoch": 2.8165857302951065, "step": 3770}, {"loss": 1.6644, "grad_norm": 0.4537523090839386, "learning_rate": 0.0002, "epoch": 2.824056779977587, "step": 3780}, {"loss": 1.6535, "grad_norm": 0.5646113157272339, "learning_rate": 0.0002, "epoch": 2.831527829660067, "step": 3790}, {"loss": 1.5712, "grad_norm": 0.44243332743644714, "learning_rate": 0.0002, "epoch": 2.8389988793425474, "step": 3800}, {"loss": 1.6478, "grad_norm": 0.4585791826248169, "learning_rate": 0.0002, "epoch": 2.846469929025028, "step": 3810}, {"loss": 1.6854, "grad_norm": 0.489702045917511, "learning_rate": 0.0002, "epoch": 2.8539409787075085, "step": 3820}, {"loss": 1.7066, "grad_norm": 0.502470850944519, "learning_rate": 0.0002, "epoch": 2.8614120283899886, "step": 3830}, {"loss": 1.5785, "grad_norm": 0.4395960867404938, "learning_rate": 0.0002, "epoch": 2.8688830780724692, "step": 3840}, {"loss": 1.6434, "grad_norm": 0.4348670244216919, "learning_rate": 0.0002, "epoch": 2.87635412775495, "step": 3850}, {"loss": 1.6163, "grad_norm": 0.48852720856666565, "learning_rate": 0.0002, "epoch": 2.88382517743743, "step": 3860}, {"loss": 1.5916, "grad_norm": 0.45317450165748596, "learning_rate": 0.0002, "epoch": 2.89129622711991, "step": 3870}, {"loss": 1.6486, "grad_norm": 0.4732758700847626, "learning_rate": 0.0002, "epoch": 2.8987672768023907, "step": 3880}, {"loss": 1.6758, "grad_norm": 0.45238012075424194, "learning_rate": 0.0002, "epoch": 2.9062383264848712, "step": 3890}, {"loss": 1.6228, "grad_norm": 0.48838064074516296, "learning_rate": 0.0002, "epoch": 2.9137093761673514, "step": 3900}, {"loss": 1.658, "grad_norm": 0.43496349453926086, "learning_rate": 0.0002, "epoch": 2.921180425849832, "step": 3910}, {"loss": 1.7063, "grad_norm": 0.47963935136795044, "learning_rate": 0.0002, "epoch": 2.9286514755323125, "step": 3920}, {"loss": 1.6553, "grad_norm": 0.4544987976551056, "learning_rate": 0.0002, "epoch": 2.9361225252147927, "step": 3930}, {"loss": 1.6192, "grad_norm": 0.4622892141342163, "learning_rate": 0.0002, "epoch": 2.943593574897273, "step": 3940}, {"loss": 1.6178, "grad_norm": 0.47026222944259644, "learning_rate": 0.0002, "epoch": 2.9510646245797534, "step": 3950}, {"loss": 1.6612, "grad_norm": 0.4549552798271179, "learning_rate": 0.0002, "epoch": 2.958535674262234, "step": 3960}, {"loss": 1.6458, "grad_norm": 0.46647515892982483, "learning_rate": 0.0002, "epoch": 2.966006723944714, "step": 3970}, {"loss": 1.6051, "grad_norm": 0.45095112919807434, "learning_rate": 0.0002, "epoch": 2.9734777736271947, "step": 3980}, {"loss": 1.6471, "grad_norm": 0.4690017104148865, "learning_rate": 0.0002, "epoch": 2.9809488233096753, "step": 3990}, {"loss": 1.6061, "grad_norm": 0.4603444039821625, "learning_rate": 0.0002, "epoch": 2.9884198729921554, "step": 4000}, {"loss": 1.6431, "grad_norm": 0.4743294417858124, "learning_rate": 0.0002, "epoch": 2.9958909226746355, "step": 4010}, {"eval_loss": 1.8252571821212769, "eval_runtime": 38.7853, "eval_samples_per_second": 13.278, "eval_steps_per_second": 1.676, "epoch": 2.999626447515876, "step": 4015}, {"loss": 1.6512, "grad_norm": 0.4919724464416504, "learning_rate": 0.0002, "epoch": 3.003361972357116, "step": 4020}, {"loss": 1.5354, "grad_norm": 0.4747185707092285, "learning_rate": 0.0002, "epoch": 3.0108330220395967, "step": 4030}, {"loss": 1.568, "grad_norm": 0.4797595143318176, "learning_rate": 0.0002, "epoch": 3.018304071722077, "step": 4040}, {"loss": 1.5194, "grad_norm": 0.5450999140739441, "learning_rate": 0.0002, "epoch": 3.0257751214045574, "step": 4050}, {"loss": 1.5065, "grad_norm": 0.49058812856674194, "learning_rate": 0.0002, "epoch": 3.0332461710870375, "step": 4060}, {"loss": 1.4884, "grad_norm": 0.5219563841819763, "learning_rate": 0.0002, "epoch": 3.040717220769518, "step": 4070}, {"loss": 1.4742, "grad_norm": 0.515628457069397, "learning_rate": 0.0002, "epoch": 3.0481882704519987, "step": 4080}, {"loss": 1.5313, "grad_norm": 0.6145984530448914, "learning_rate": 0.0002, "epoch": 3.055659320134479, "step": 4090}, {"loss": 1.4989, "grad_norm": 0.6067144274711609, "learning_rate": 0.0002, "epoch": 3.0631303698169594, "step": 4100}, {"loss": 1.528, "grad_norm": 0.5773133039474487, "learning_rate": 0.0002, "epoch": 3.0706014194994395, "step": 4110}, {"loss": 1.5374, "grad_norm": 0.6894241571426392, "learning_rate": 0.0002, "epoch": 3.07807246918192, "step": 4120}, {"loss": 1.5422, "grad_norm": 0.6422514915466309, "learning_rate": 0.0002, "epoch": 3.0855435188644003, "step": 4130}, {"loss": 1.4724, "grad_norm": 0.6119855046272278, "learning_rate": 0.0002, "epoch": 3.093014568546881, "step": 4140}, {"loss": 1.5361, "grad_norm": 0.5847280025482178, "learning_rate": 0.0002, "epoch": 3.1004856182293614, "step": 4150}, {"loss": 1.5151, "grad_norm": 0.5401515960693359, "learning_rate": 0.0002, "epoch": 3.1079566679118416, "step": 4160}, {"loss": 1.502, "grad_norm": 0.6501587629318237, "learning_rate": 0.0002, "epoch": 3.115427717594322, "step": 4170}, {"loss": 1.4952, "grad_norm": 0.5988039374351501, "learning_rate": 0.0002, "epoch": 3.1228987672768023, "step": 4180}, {"loss": 1.5287, "grad_norm": 0.4982665181159973, "learning_rate": 0.0002, "epoch": 3.130369816959283, "step": 4190}, {"loss": 1.5078, "grad_norm": 0.5548039078712463, "learning_rate": 0.0002, "epoch": 3.137840866641763, "step": 4200}, {"loss": 1.4904, "grad_norm": 0.5920777320861816, "learning_rate": 0.0002, "epoch": 3.1453119163242436, "step": 4210}, {"loss": 1.442, "grad_norm": 0.6965190172195435, "learning_rate": 0.0002, "epoch": 3.152782966006724, "step": 4220}, {"loss": 1.557, "grad_norm": 0.5196244716644287, "learning_rate": 0.0002, "epoch": 3.1602540156892043, "step": 4230}, {"loss": 1.5706, "grad_norm": 0.6942682266235352, "learning_rate": 0.0002, "epoch": 3.167725065371685, "step": 4240}, {"loss": 1.5407, "grad_norm": 0.5765156149864197, "learning_rate": 0.0002, "epoch": 3.175196115054165, "step": 4250}, {"loss": 1.4963, "grad_norm": 0.5801976919174194, "learning_rate": 0.0002, "epoch": 3.1826671647366456, "step": 4260}, {"loss": 1.4988, "grad_norm": 0.6260752081871033, "learning_rate": 0.0002, "epoch": 3.1901382144191257, "step": 4270}, {"loss": 1.5074, "grad_norm": 0.6610770225524902, "learning_rate": 0.0002, "epoch": 3.1976092641016063, "step": 4280}, {"loss": 1.4657, "grad_norm": 0.5762143135070801, "learning_rate": 0.0002, "epoch": 3.205080313784087, "step": 4290}, {"loss": 1.5181, "grad_norm": 0.5926990509033203, "learning_rate": 0.0002, "epoch": 3.212551363466567, "step": 4300}, {"loss": 1.5492, "grad_norm": 0.7373854517936707, "learning_rate": 0.0002, "epoch": 3.2200224131490476, "step": 4310}, {"loss": 1.4648, "grad_norm": 0.5963311195373535, "learning_rate": 0.0002, "epoch": 3.2274934628315277, "step": 4320}, {"loss": 1.5262, "grad_norm": 0.5754616856575012, "learning_rate": 0.0002, "epoch": 3.2349645125140083, "step": 4330}, {"loss": 1.4767, "grad_norm": 0.6116095781326294, "learning_rate": 0.0002, "epoch": 3.2424355621964884, "step": 4340}, {"loss": 1.5008, "grad_norm": 0.6001536846160889, "learning_rate": 0.0002, "epoch": 3.249906611878969, "step": 4350}, {"loss": 1.5738, "grad_norm": 0.5270227789878845, "learning_rate": 0.0002, "epoch": 3.257377661561449, "step": 4360}, {"loss": 1.5235, "grad_norm": 0.6666602492332458, "learning_rate": 0.0002, "epoch": 3.2648487112439297, "step": 4370}, {"loss": 1.5665, "grad_norm": 0.520310640335083, "learning_rate": 0.0002, "epoch": 3.2723197609264103, "step": 4380}, {"loss": 1.542, "grad_norm": 0.5165975093841553, "learning_rate": 0.0002, "epoch": 3.2797908106088904, "step": 4390}, {"loss": 1.4746, "grad_norm": 0.6080228686332703, "learning_rate": 0.0002, "epoch": 3.287261860291371, "step": 4400}, {"loss": 1.4901, "grad_norm": 0.670122504234314, "learning_rate": 0.0002, "epoch": 3.294732909973851, "step": 4410}, {"loss": 1.4677, "grad_norm": 0.6019457578659058, "learning_rate": 0.0002, "epoch": 3.3022039596563317, "step": 4420}, {"loss": 1.4249, "grad_norm": 0.5519300103187561, "learning_rate": 0.0002, "epoch": 3.309675009338812, "step": 4430}, {"loss": 1.555, "grad_norm": 0.5958521962165833, "learning_rate": 0.0002, "epoch": 3.3171460590212924, "step": 4440}, {"loss": 1.5067, "grad_norm": 0.5552705526351929, "learning_rate": 0.0002, "epoch": 3.324617108703773, "step": 4450}, {"loss": 1.5926, "grad_norm": 0.6583784818649292, "learning_rate": 0.0002, "epoch": 3.332088158386253, "step": 4460}, {"loss": 1.4206, "grad_norm": 0.5815939903259277, "learning_rate": 0.0002, "epoch": 3.3395592080687337, "step": 4470}, {"loss": 1.5942, "grad_norm": 1.3342205286026, "learning_rate": 0.0002, "epoch": 3.347030257751214, "step": 4480}, {"loss": 1.484, "grad_norm": 0.6341500878334045, "learning_rate": 0.0002, "epoch": 3.3545013074336945, "step": 4490}, {"loss": 1.5219, "grad_norm": 0.6384079456329346, "learning_rate": 0.0002, "epoch": 3.3619723571161746, "step": 4500}, {"loss": 1.5222, "grad_norm": 0.6098346710205078, "learning_rate": 0.0002, "epoch": 3.369443406798655, "step": 4510}, {"loss": 1.5475, "grad_norm": 0.5958296656608582, "learning_rate": 0.0002, "epoch": 3.3769144564811358, "step": 4520}, {"loss": 1.5171, "grad_norm": 0.6157881617546082, "learning_rate": 0.0002, "epoch": 3.384385506163616, "step": 4530}, {"loss": 1.569, "grad_norm": 0.5671007037162781, "learning_rate": 0.0002, "epoch": 3.3918565558460965, "step": 4540}, {"loss": 1.604, "grad_norm": 0.6203294992446899, "learning_rate": 0.0002, "epoch": 3.3993276055285766, "step": 4550}, {"loss": 1.5364, "grad_norm": 0.6743317246437073, "learning_rate": 0.0002, "epoch": 3.406798655211057, "step": 4560}, {"loss": 1.5034, "grad_norm": 0.731765627861023, "learning_rate": 0.0002, "epoch": 3.4142697048935373, "step": 4570}, {"loss": 1.4585, "grad_norm": 0.6285187602043152, "learning_rate": 0.0002, "epoch": 3.421740754576018, "step": 4580}, {"loss": 1.5296, "grad_norm": 0.612680196762085, "learning_rate": 0.0002, "epoch": 3.4292118042584985, "step": 4590}, {"loss": 1.5577, "grad_norm": 0.6413681507110596, "learning_rate": 0.0002, "epoch": 3.4366828539409786, "step": 4600}, {"loss": 1.5026, "grad_norm": 0.6240990161895752, "learning_rate": 0.0002, "epoch": 3.444153903623459, "step": 4610}, {"loss": 1.5887, "grad_norm": 0.5095735192298889, "learning_rate": 0.0002, "epoch": 3.4516249533059393, "step": 4620}, {"loss": 1.4906, "grad_norm": 0.5699611902236938, "learning_rate": 0.0002, "epoch": 3.45909600298842, "step": 4630}, {"loss": 1.5176, "grad_norm": 0.7289775609970093, "learning_rate": 0.0002, "epoch": 3.4665670526709, "step": 4640}, {"loss": 1.5467, "grad_norm": 0.6211609840393066, "learning_rate": 0.0002, "epoch": 3.4740381023533806, "step": 4650}, {"loss": 1.533, "grad_norm": 0.5714802145957947, "learning_rate": 0.0002, "epoch": 3.481509152035861, "step": 4660}, {"loss": 1.5096, "grad_norm": 0.6287049651145935, "learning_rate": 0.0002, "epoch": 3.4889802017183413, "step": 4670}, {"loss": 1.4212, "grad_norm": 0.5480595827102661, "learning_rate": 0.0002, "epoch": 3.496451251400822, "step": 4680}, {"loss": 1.4746, "grad_norm": 0.5683253407478333, "learning_rate": 0.0002, "epoch": 3.503922301083302, "step": 4690}, {"loss": 1.5012, "grad_norm": 0.601140558719635, "learning_rate": 0.0002, "epoch": 3.5113933507657826, "step": 4700}, {"loss": 1.5383, "grad_norm": 0.5344498157501221, "learning_rate": 0.0002, "epoch": 3.5188644004482628, "step": 4710}, {"loss": 1.5428, "grad_norm": 0.5739690661430359, "learning_rate": 0.0002, "epoch": 3.5263354501307433, "step": 4720}, {"loss": 1.5589, "grad_norm": 0.5640085935592651, "learning_rate": 0.0002, "epoch": 3.533806499813224, "step": 4730}, {"loss": 1.487, "grad_norm": 0.5967805981636047, "learning_rate": 0.0002, "epoch": 3.541277549495704, "step": 4740}, {"loss": 1.5461, "grad_norm": 0.6138835549354553, "learning_rate": 0.0002, "epoch": 3.5487485991781846, "step": 4750}, {"loss": 1.5502, "grad_norm": 0.6779900193214417, "learning_rate": 0.0002, "epoch": 3.5562196488606648, "step": 4760}, {"loss": 1.4917, "grad_norm": 0.6122010350227356, "learning_rate": 0.0002, "epoch": 3.5636906985431454, "step": 4770}, {"loss": 1.5405, "grad_norm": 0.5685241222381592, "learning_rate": 0.0002, "epoch": 3.5711617482256255, "step": 4780}, {"loss": 1.5427, "grad_norm": 0.604583203792572, "learning_rate": 0.0002, "epoch": 3.578632797908106, "step": 4790}, {"loss": 1.4514, "grad_norm": 0.651165246963501, "learning_rate": 0.0002, "epoch": 3.5861038475905866, "step": 4800}, {"loss": 1.4109, "grad_norm": 0.6398511528968811, "learning_rate": 0.0002, "epoch": 3.593574897273067, "step": 4810}, {"loss": 1.4261, "grad_norm": 0.6444641351699829, "learning_rate": 0.0002, "epoch": 3.6010459469555474, "step": 4820}, {"loss": 1.5274, "grad_norm": 0.6018481850624084, "learning_rate": 0.0002, "epoch": 3.6085169966380275, "step": 4830}, {"loss": 1.4647, "grad_norm": 0.6025291085243225, "learning_rate": 0.0002, "epoch": 3.615988046320508, "step": 4840}, {"loss": 1.5609, "grad_norm": 0.6810156106948853, "learning_rate": 0.0002, "epoch": 3.623459096002988, "step": 4850}, {"loss": 1.5299, "grad_norm": 0.6408044695854187, "learning_rate": 0.0002, "epoch": 3.630930145685469, "step": 4860}, {"loss": 1.5366, "grad_norm": 0.5608272552490234, "learning_rate": 0.0002, "epoch": 3.6384011953679494, "step": 4870}, {"loss": 1.5188, "grad_norm": 0.6136814951896667, "learning_rate": 0.0002, "epoch": 3.6458722450504295, "step": 4880}, {"loss": 1.5021, "grad_norm": 0.5927900075912476, "learning_rate": 0.0002, "epoch": 3.65334329473291, "step": 4890}, {"loss": 1.6084, "grad_norm": 0.5336901545524597, "learning_rate": 0.0002, "epoch": 3.66081434441539, "step": 4900}, {"loss": 1.5701, "grad_norm": 0.7823320627212524, "learning_rate": 0.0002, "epoch": 3.668285394097871, "step": 4910}, {"loss": 1.4881, "grad_norm": 0.6703504323959351, "learning_rate": 0.0002, "epoch": 3.675756443780351, "step": 4920}, {"loss": 1.5332, "grad_norm": 0.6061160564422607, "learning_rate": 0.0002, "epoch": 3.6832274934628315, "step": 4930}, {"loss": 1.5405, "grad_norm": 0.6237227916717529, "learning_rate": 0.0002, "epoch": 3.690698543145312, "step": 4940}, {"loss": 1.497, "grad_norm": 0.5985278487205505, "learning_rate": 0.0002, "epoch": 3.6981695928277922, "step": 4950}, {"loss": 1.5132, "grad_norm": 0.6483839750289917, "learning_rate": 0.0002, "epoch": 3.705640642510273, "step": 4960}, {"loss": 1.5338, "grad_norm": 0.5788805484771729, "learning_rate": 0.0002, "epoch": 3.713111692192753, "step": 4970}, {"loss": 1.5258, "grad_norm": 0.5609974265098572, "learning_rate": 0.0002, "epoch": 3.7205827418752335, "step": 4980}, {"loss": 1.4759, "grad_norm": 0.5681300759315491, "learning_rate": 0.0002, "epoch": 3.7280537915577137, "step": 4990}, {"loss": 1.6018, "grad_norm": 0.5860186219215393, "learning_rate": 0.0002, "epoch": 3.7355248412401942, "step": 5000}, {"loss": 1.58, "grad_norm": 0.5718157291412354, "learning_rate": 0.0002, "epoch": 3.742995890922675, "step": 5010}, {"loss": 1.5834, "grad_norm": 0.6173721551895142, "learning_rate": 0.0002, "epoch": 3.750466940605155, "step": 5020}, {"loss": 1.5617, "grad_norm": 0.629152238368988, "learning_rate": 0.0002, "epoch": 3.7579379902876355, "step": 5030}, {"loss": 1.519, "grad_norm": 0.5666284561157227, "learning_rate": 0.0002, "epoch": 3.7654090399701157, "step": 5040}, {"loss": 1.5329, "grad_norm": 0.6053005456924438, "learning_rate": 0.0002, "epoch": 3.7728800896525962, "step": 5050}, {"loss": 1.5404, "grad_norm": 0.5870583057403564, "learning_rate": 0.0002, "epoch": 3.7803511393350764, "step": 5060}, {"loss": 1.4444, "grad_norm": 0.5422009229660034, "learning_rate": 0.0002, "epoch": 3.787822189017557, "step": 5070}, {"loss": 1.5308, "grad_norm": 0.5396918058395386, "learning_rate": 0.0002, "epoch": 3.7952932387000375, "step": 5080}, {"loss": 1.464, "grad_norm": 0.5544713139533997, "learning_rate": 0.0002, "epoch": 3.8027642883825177, "step": 5090}, {"loss": 1.4752, "grad_norm": 0.5983749628067017, "learning_rate": 0.0002, "epoch": 3.8102353380649983, "step": 5100}, {"loss": 1.4972, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 3.8177063877474784, "step": 5110}, {"loss": 1.5471, "grad_norm": 0.5436882376670837, "learning_rate": 0.0002, "epoch": 3.825177437429959, "step": 5120}, {"loss": 1.5118, "grad_norm": 0.5453617572784424, "learning_rate": 0.0002, "epoch": 3.832648487112439, "step": 5130}, {"loss": 1.5732, "grad_norm": 0.6269069314002991, "learning_rate": 0.0002, "epoch": 3.8401195367949197, "step": 5140}, {"loss": 1.4959, "grad_norm": 0.6189185380935669, "learning_rate": 0.0002, "epoch": 3.8475905864774003, "step": 5150}, {"loss": 1.4999, "grad_norm": 0.6653388142585754, "learning_rate": 0.0002, "epoch": 3.8550616361598804, "step": 5160}, {"loss": 1.5075, "grad_norm": 0.5771768689155579, "learning_rate": 0.0002, "epoch": 3.862532685842361, "step": 5170}, {"loss": 1.5545, "grad_norm": 0.6052790880203247, "learning_rate": 0.0002, "epoch": 3.870003735524841, "step": 5180}, {"loss": 1.4987, "grad_norm": 0.6572316884994507, "learning_rate": 0.0002, "epoch": 3.8774747852073217, "step": 5190}, {"loss": 1.5241, "grad_norm": 0.670576810836792, "learning_rate": 0.0002, "epoch": 3.884945834889802, "step": 5200}, {"loss": 1.4777, "grad_norm": 0.5728798508644104, "learning_rate": 0.0002, "epoch": 3.8924168845722824, "step": 5210}, {"loss": 1.5351, "grad_norm": 0.6340774297714233, "learning_rate": 0.0002, "epoch": 3.899887934254763, "step": 5220}, {"loss": 1.5081, "grad_norm": 0.5981315970420837, "learning_rate": 0.0002, "epoch": 3.907358983937243, "step": 5230}, {"loss": 1.4875, "grad_norm": 0.6212025880813599, "learning_rate": 0.0002, "epoch": 3.9148300336197237, "step": 5240}, {"loss": 1.5545, "grad_norm": 0.6202296018600464, "learning_rate": 0.0002, "epoch": 3.922301083302204, "step": 5250}, {"loss": 1.5765, "grad_norm": 0.6159142255783081, "learning_rate": 0.0002, "epoch": 3.9297721329846844, "step": 5260}, {"loss": 1.4938, "grad_norm": 0.6519438624382019, "learning_rate": 0.0002, "epoch": 3.9372431826671646, "step": 5270}, {"loss": 1.4859, "grad_norm": 0.539813756942749, "learning_rate": 0.0002, "epoch": 3.944714232349645, "step": 5280}, {"loss": 1.5921, "grad_norm": 0.6443665027618408, "learning_rate": 0.0002, "epoch": 3.9521852820321257, "step": 5290}, {"loss": 1.5153, "grad_norm": 0.6635757684707642, "learning_rate": 0.0002, "epoch": 3.959656331714606, "step": 5300}, {"loss": 1.5485, "grad_norm": 0.589363157749176, "learning_rate": 0.0002, "epoch": 3.9671273813970864, "step": 5310}, {"loss": 1.5498, "grad_norm": 0.5788735747337341, "learning_rate": 0.0002, "epoch": 3.9745984310795666, "step": 5320}, {"loss": 1.5607, "grad_norm": 0.5976864695549011, "learning_rate": 0.0002, "epoch": 3.982069480762047, "step": 5330}, {"loss": 1.5302, "grad_norm": 0.6624067425727844, "learning_rate": 0.0002, "epoch": 3.9895405304445273, "step": 5340}, {"loss": 1.5904, "grad_norm": 0.6738956570625305, "learning_rate": 0.0002, "epoch": 3.997011580127008, "step": 5350}, {"eval_loss": 1.868006944656372, "eval_runtime": 38.5153, "eval_samples_per_second": 13.371, "eval_steps_per_second": 1.688, "epoch": 4.0, "step": 5354}, {"loss": 1.4535, "grad_norm": 0.6023468971252441, "learning_rate": 0.0002, "epoch": 4.004482629809488, "step": 5360}, {"loss": 1.3987, "grad_norm": 0.8589285612106323, "learning_rate": 0.0002, "epoch": 4.011953679491969, "step": 5370}, {"loss": 1.3952, "grad_norm": 0.7477491497993469, "learning_rate": 0.0002, "epoch": 4.019424729174449, "step": 5380}, {"loss": 1.3745, "grad_norm": 0.7601922154426575, "learning_rate": 0.0002, "epoch": 4.02689577885693, "step": 5390}, {"loss": 1.4133, "grad_norm": 0.8115614056587219, "learning_rate": 0.0002, "epoch": 4.03436682853941, "step": 5400}, {"loss": 1.3748, "grad_norm": 0.669925332069397, "learning_rate": 0.0002, "epoch": 4.04183787822189, "step": 5410}, {"loss": 1.2835, "grad_norm": 0.8091904520988464, "learning_rate": 0.0002, "epoch": 4.04930892790437, "step": 5420}, {"loss": 1.3615, "grad_norm": 0.709405779838562, "learning_rate": 0.0002, "epoch": 4.056779977586851, "step": 5430}, {"loss": 1.3558, "grad_norm": 1.0006179809570312, "learning_rate": 0.0002, "epoch": 4.064251027269331, "step": 5440}, {"loss": 1.3491, "grad_norm": 0.7017965912818909, "learning_rate": 0.0002, "epoch": 4.071722076951811, "step": 5450}, {"loss": 1.3642, "grad_norm": 0.8991572260856628, "learning_rate": 0.0002, "epoch": 4.0791931266342925, "step": 5460}, {"loss": 1.392, "grad_norm": 0.9064797759056091, "learning_rate": 0.0002, "epoch": 4.086664176316773, "step": 5470}, {"loss": 1.3425, "grad_norm": 0.7981749176979065, "learning_rate": 0.0002, "epoch": 4.094135225999253, "step": 5480}, {"loss": 1.3826, "grad_norm": 0.7280883193016052, "learning_rate": 0.0002, "epoch": 4.101606275681733, "step": 5490}, {"loss": 1.3275, "grad_norm": 0.7419600486755371, "learning_rate": 0.0002, "epoch": 4.109077325364214, "step": 5500}, {"loss": 1.3199, "grad_norm": 0.8019949197769165, "learning_rate": 0.0002, "epoch": 4.116548375046694, "step": 5510}, {"loss": 1.3133, "grad_norm": 0.7501229047775269, "learning_rate": 0.0002, "epoch": 4.124019424729174, "step": 5520}, {"loss": 1.4432, "grad_norm": 0.8166249990463257, "learning_rate": 0.0002, "epoch": 4.131490474411655, "step": 5530}, {"loss": 1.3901, "grad_norm": 0.9728496074676514, "learning_rate": 0.0002, "epoch": 4.138961524094135, "step": 5540}, {"loss": 1.3538, "grad_norm": 0.7590922117233276, "learning_rate": 0.0002, "epoch": 4.1464325737766154, "step": 5550}, {"loss": 1.4368, "grad_norm": 0.7759010791778564, "learning_rate": 0.0002, "epoch": 4.153903623459096, "step": 5560}, {"loss": 1.3635, "grad_norm": 0.9057986736297607, "learning_rate": 0.0002, "epoch": 4.161374673141577, "step": 5570}, {"loss": 1.4152, "grad_norm": 0.8853937983512878, "learning_rate": 0.0002, "epoch": 4.168845722824057, "step": 5580}, {"loss": 1.3633, "grad_norm": 0.7070684432983398, "learning_rate": 0.0002, "epoch": 4.176316772506537, "step": 5590}, {"loss": 1.3218, "grad_norm": 0.7649410963058472, "learning_rate": 0.0002, "epoch": 4.183787822189018, "step": 5600}, {"loss": 1.3857, "grad_norm": 1.2048029899597168, "learning_rate": 0.0002, "epoch": 4.191258871871498, "step": 5610}, {"loss": 1.3629, "grad_norm": 0.7986605763435364, "learning_rate": 0.0002, "epoch": 4.198729921553978, "step": 5620}, {"loss": 1.3995, "grad_norm": 0.8151885867118835, "learning_rate": 0.0002, "epoch": 4.206200971236458, "step": 5630}, {"loss": 1.3782, "grad_norm": 0.7719064354896545, "learning_rate": 0.0002, "epoch": 4.213672020918939, "step": 5640}, {"loss": 1.3852, "grad_norm": 0.8422448039054871, "learning_rate": 0.0002, "epoch": 4.2211430706014195, "step": 5650}, {"loss": 1.3321, "grad_norm": 0.7017164826393127, "learning_rate": 0.0002, "epoch": 4.2286141202839, "step": 5660}, {"loss": 1.4105, "grad_norm": 0.8559677600860596, "learning_rate": 0.0002, "epoch": 4.236085169966381, "step": 5670}, {"loss": 1.3701, "grad_norm": 0.8216157555580139, "learning_rate": 0.0002, "epoch": 4.243556219648861, "step": 5680}, {"loss": 1.3565, "grad_norm": 0.7681755423545837, "learning_rate": 0.0002, "epoch": 4.251027269331341, "step": 5690}, {"loss": 1.3806, "grad_norm": 0.811665952205658, "learning_rate": 0.0002, "epoch": 4.258498319013821, "step": 5700}, {"loss": 1.4161, "grad_norm": 0.7242204546928406, "learning_rate": 0.0002, "epoch": 4.265969368696302, "step": 5710}, {"loss": 1.2958, "grad_norm": 0.7570181488990784, "learning_rate": 0.0002, "epoch": 4.273440418378782, "step": 5720}, {"loss": 1.4265, "grad_norm": 0.8951969146728516, "learning_rate": 0.0002, "epoch": 4.280911468061262, "step": 5730}, {"loss": 1.3895, "grad_norm": 0.7222902178764343, "learning_rate": 0.0002, "epoch": 4.288382517743743, "step": 5740}, {"loss": 1.4155, "grad_norm": 0.8508469462394714, "learning_rate": 0.0002, "epoch": 4.2958535674262235, "step": 5750}, {"loss": 1.365, "grad_norm": 0.7215430736541748, "learning_rate": 0.0002, "epoch": 4.303324617108704, "step": 5760}, {"loss": 1.4472, "grad_norm": 0.8774884939193726, "learning_rate": 0.0002, "epoch": 4.310795666791184, "step": 5770}, {"loss": 1.427, "grad_norm": 0.8354552984237671, "learning_rate": 0.0002, "epoch": 4.318266716473665, "step": 5780}, {"loss": 1.3222, "grad_norm": 0.6938814520835876, "learning_rate": 0.0002, "epoch": 4.325737766156145, "step": 5790}, {"loss": 1.3589, "grad_norm": 0.78675377368927, "learning_rate": 0.0002, "epoch": 4.333208815838625, "step": 5800}, {"loss": 1.3662, "grad_norm": 0.7147697806358337, "learning_rate": 0.0002, "epoch": 4.340679865521106, "step": 5810}, {"loss": 1.3597, "grad_norm": 0.7693623304367065, "learning_rate": 0.0002, "epoch": 4.348150915203586, "step": 5820}, {"loss": 1.2944, "grad_norm": 0.856517493724823, "learning_rate": 0.0002, "epoch": 4.355621964886066, "step": 5830}, {"loss": 1.4307, "grad_norm": 0.7200973033905029, "learning_rate": 0.0002, "epoch": 4.3630930145685465, "step": 5840}, {"loss": 1.442, "grad_norm": 0.743281364440918, "learning_rate": 0.0002, "epoch": 4.3705640642510275, "step": 5850}, {"loss": 1.3999, "grad_norm": 0.7627727389335632, "learning_rate": 0.0002, "epoch": 4.378035113933508, "step": 5860}, {"loss": 1.4082, "grad_norm": 0.7238836884498596, "learning_rate": 0.0002, "epoch": 4.385506163615988, "step": 5870}, {"loss": 1.4292, "grad_norm": 0.7253410816192627, "learning_rate": 0.0002, "epoch": 4.392977213298469, "step": 5880}, {"loss": 1.3774, "grad_norm": 0.8232238292694092, "learning_rate": 0.0002, "epoch": 4.400448262980949, "step": 5890}, {"loss": 1.3757, "grad_norm": 0.8778504729270935, "learning_rate": 0.0002, "epoch": 4.407919312663429, "step": 5900}, {"loss": 1.387, "grad_norm": 0.7639474868774414, "learning_rate": 0.0002, "epoch": 4.415390362345909, "step": 5910}, {"loss": 1.3862, "grad_norm": 0.7666519284248352, "learning_rate": 0.0002, "epoch": 4.42286141202839, "step": 5920}, {"loss": 1.4168, "grad_norm": 0.867132842540741, "learning_rate": 0.0002, "epoch": 4.43033246171087, "step": 5930}, {"loss": 1.4772, "grad_norm": 0.7571166753768921, "learning_rate": 0.0002, "epoch": 4.4378035113933505, "step": 5940}, {"loss": 1.4401, "grad_norm": 0.7911370992660522, "learning_rate": 0.0002, "epoch": 4.4452745610758315, "step": 5950}, {"loss": 1.4516, "grad_norm": 0.8844250440597534, "learning_rate": 0.0002, "epoch": 4.452745610758312, "step": 5960}, {"loss": 1.4109, "grad_norm": 0.7336231470108032, "learning_rate": 0.0002, "epoch": 4.460216660440792, "step": 5970}, {"loss": 1.3891, "grad_norm": 0.8162738084793091, "learning_rate": 0.0002, "epoch": 4.467687710123272, "step": 5980}, {"loss": 1.393, "grad_norm": 0.7413017153739929, "learning_rate": 0.0002, "epoch": 4.475158759805753, "step": 5990}, {"loss": 1.3712, "grad_norm": 0.7215432524681091, "learning_rate": 0.0002, "epoch": 4.482629809488233, "step": 6000}, {"loss": 1.3521, "grad_norm": 0.8943389058113098, "learning_rate": 0.0002, "epoch": 4.490100859170713, "step": 6010}, {"loss": 1.4172, "grad_norm": 0.7850823998451233, "learning_rate": 0.0002, "epoch": 4.497571908853194, "step": 6020}, {"loss": 1.3582, "grad_norm": 0.8117504119873047, "learning_rate": 0.0002, "epoch": 4.505042958535674, "step": 6030}, {"loss": 1.4272, "grad_norm": 0.8381605744361877, "learning_rate": 0.0002, "epoch": 4.5125140082181545, "step": 6040}, {"loss": 1.3829, "grad_norm": 0.7964059710502625, "learning_rate": 0.0002, "epoch": 4.519985057900635, "step": 6050}, {"loss": 1.3555, "grad_norm": 0.7935128211975098, "learning_rate": 0.0002, "epoch": 4.527456107583116, "step": 6060}, {"loss": 1.3994, "grad_norm": 0.8725124597549438, "learning_rate": 0.0002, "epoch": 4.534927157265596, "step": 6070}, {"loss": 1.3923, "grad_norm": 0.880325198173523, "learning_rate": 0.0002, "epoch": 4.542398206948076, "step": 6080}, {"loss": 1.4459, "grad_norm": 0.7220637202262878, "learning_rate": 0.0002, "epoch": 4.549869256630557, "step": 6090}, {"loss": 1.3281, "grad_norm": 0.6908547878265381, "learning_rate": 0.0002, "epoch": 4.557340306313037, "step": 6100}, {"loss": 1.437, "grad_norm": 0.797931969165802, "learning_rate": 0.0002, "epoch": 4.564811355995517, "step": 6110}, {"loss": 1.4023, "grad_norm": 0.7056134343147278, "learning_rate": 0.0002, "epoch": 4.572282405677997, "step": 6120}, {"loss": 1.3814, "grad_norm": 0.7850478887557983, "learning_rate": 0.0002, "epoch": 4.579753455360478, "step": 6130}, {"loss": 1.3579, "grad_norm": 0.8112621307373047, "learning_rate": 0.0002, "epoch": 4.5872245050429585, "step": 6140}, {"loss": 1.3523, "grad_norm": 0.7040849328041077, "learning_rate": 0.0002, "epoch": 4.594695554725439, "step": 6150}, {"loss": 1.3526, "grad_norm": 0.7214553952217102, "learning_rate": 0.0002, "epoch": 4.60216660440792, "step": 6160}, {"loss": 1.3932, "grad_norm": 0.8616511821746826, "learning_rate": 0.0002, "epoch": 4.6096376540904, "step": 6170}, {"loss": 1.4622, "grad_norm": 0.8374658226966858, "learning_rate": 0.0002, "epoch": 4.61710870377288, "step": 6180}, {"loss": 1.3703, "grad_norm": 0.6761606931686401, "learning_rate": 0.0002, "epoch": 4.62457975345536, "step": 6190}, {"loss": 1.3977, "grad_norm": 0.768028199672699, "learning_rate": 0.0002, "epoch": 4.632050803137841, "step": 6200}, {"loss": 1.3772, "grad_norm": 0.9372717142105103, "learning_rate": 0.0002, "epoch": 4.639521852820321, "step": 6210}, {"loss": 1.4098, "grad_norm": 0.7906546592712402, "learning_rate": 0.0002, "epoch": 4.646992902502801, "step": 6220}, {"loss": 1.3962, "grad_norm": 0.7376723289489746, "learning_rate": 0.0002, "epoch": 4.654463952185282, "step": 6230}, {"loss": 1.4529, "grad_norm": 0.8972630500793457, "learning_rate": 0.0002, "epoch": 4.6619350018677626, "step": 6240}, {"loss": 1.4668, "grad_norm": 0.8261756300926208, "learning_rate": 0.0002, "epoch": 4.669406051550243, "step": 6250}, {"loss": 1.3267, "grad_norm": 0.7512393593788147, "learning_rate": 0.0002, "epoch": 4.676877101232723, "step": 6260}, {"loss": 1.4278, "grad_norm": 0.7132362127304077, "learning_rate": 0.0002, "epoch": 4.684348150915204, "step": 6270}, {"loss": 1.4299, "grad_norm": 0.7690575122833252, "learning_rate": 0.0002, "epoch": 4.691819200597684, "step": 6280}, {"loss": 1.4769, "grad_norm": 0.9886258840560913, "learning_rate": 0.0002, "epoch": 4.699290250280164, "step": 6290}, {"loss": 1.4005, "grad_norm": 0.9502435922622681, "learning_rate": 0.0002, "epoch": 4.706761299962645, "step": 6300}, {"loss": 1.4319, "grad_norm": 0.702255129814148, "learning_rate": 0.0002, "epoch": 4.714232349645125, "step": 6310}, {"loss": 1.4447, "grad_norm": 0.7713103890419006, "learning_rate": 0.0002, "epoch": 4.721703399327605, "step": 6320}, {"loss": 1.4392, "grad_norm": 0.7778580784797668, "learning_rate": 0.0002, "epoch": 4.7291744490100855, "step": 6330}, {"loss": 1.4169, "grad_norm": 0.7275111079216003, "learning_rate": 0.0002, "epoch": 4.736645498692567, "step": 6340}, {"loss": 1.4429, "grad_norm": 0.7728744149208069, "learning_rate": 0.0002, "epoch": 4.744116548375047, "step": 6350}, {"loss": 1.3756, "grad_norm": 0.9724260568618774, "learning_rate": 0.0002, "epoch": 4.751587598057527, "step": 6360}, {"loss": 1.3358, "grad_norm": 0.7505622506141663, "learning_rate": 0.0002, "epoch": 4.759058647740007, "step": 6370}, {"loss": 1.379, "grad_norm": 0.7994682788848877, "learning_rate": 0.0002, "epoch": 4.766529697422488, "step": 6380}, {"loss": 1.4275, "grad_norm": 0.8432038426399231, "learning_rate": 0.0002, "epoch": 4.774000747104968, "step": 6390}, {"loss": 1.4606, "grad_norm": 0.7436022758483887, "learning_rate": 0.0002, "epoch": 4.781471796787448, "step": 6400}, {"loss": 1.3461, "grad_norm": 0.7709194421768188, "learning_rate": 0.0002, "epoch": 4.788942846469929, "step": 6410}, {"loss": 1.3715, "grad_norm": 0.8798436522483826, "learning_rate": 0.0002, "epoch": 4.796413896152409, "step": 6420}, {"loss": 1.3761, "grad_norm": 0.790189266204834, "learning_rate": 0.0002, "epoch": 4.80388494583489, "step": 6430}, {"loss": 1.4109, "grad_norm": 0.6824303865432739, "learning_rate": 0.0002, "epoch": 4.811355995517371, "step": 6440}, {"loss": 1.3877, "grad_norm": 0.7501044869422913, "learning_rate": 0.0002, "epoch": 4.818827045199851, "step": 6450}, {"loss": 1.4458, "grad_norm": 0.8840398192405701, "learning_rate": 0.0002, "epoch": 4.826298094882331, "step": 6460}, {"loss": 1.4412, "grad_norm": 0.7812688946723938, "learning_rate": 0.0002, "epoch": 4.833769144564811, "step": 6470}, {"loss": 1.4299, "grad_norm": 0.7429926991462708, "learning_rate": 0.0002, "epoch": 4.841240194247292, "step": 6480}, {"loss": 1.5062, "grad_norm": 0.7778021693229675, "learning_rate": 0.0002, "epoch": 4.848711243929772, "step": 6490}, {"loss": 1.4589, "grad_norm": 0.8270702362060547, "learning_rate": 0.0002, "epoch": 4.856182293612252, "step": 6500}, {"loss": 1.4091, "grad_norm": 0.6960513591766357, "learning_rate": 0.0002, "epoch": 4.863653343294732, "step": 6510}, {"loss": 1.376, "grad_norm": 0.7728942632675171, "learning_rate": 0.0002, "epoch": 4.8711243929772134, "step": 6520}, {"loss": 1.4852, "grad_norm": 0.7377303838729858, "learning_rate": 0.0002, "epoch": 4.878595442659694, "step": 6530}, {"loss": 1.3846, "grad_norm": 0.7257253527641296, "learning_rate": 0.0002, "epoch": 4.886066492342174, "step": 6540}, {"loss": 1.4166, "grad_norm": 0.7875821590423584, "learning_rate": 0.0002, "epoch": 4.893537542024655, "step": 6550}, {"loss": 1.357, "grad_norm": 0.8346304297447205, "learning_rate": 0.0002, "epoch": 4.901008591707135, "step": 6560}, {"loss": 1.4522, "grad_norm": 0.7710739374160767, "learning_rate": 0.0002, "epoch": 4.908479641389615, "step": 6570}, {"loss": 1.4465, "grad_norm": 0.7015138268470764, "learning_rate": 0.0002, "epoch": 4.915950691072096, "step": 6580}, {"loss": 1.435, "grad_norm": 0.8707432150840759, "learning_rate": 0.0002, "epoch": 4.923421740754576, "step": 6590}, {"loss": 1.2968, "grad_norm": 0.786601185798645, "learning_rate": 0.0002, "epoch": 4.930892790437056, "step": 6600}, {"loss": 1.4385, "grad_norm": 0.978519082069397, "learning_rate": 0.0002, "epoch": 4.938363840119536, "step": 6610}, {"loss": 1.3997, "grad_norm": 0.8102927207946777, "learning_rate": 0.0002, "epoch": 4.9458348898020175, "step": 6620}, {"loss": 1.4859, "grad_norm": 0.7628704309463501, "learning_rate": 0.0002, "epoch": 4.953305939484498, "step": 6630}, {"loss": 1.3774, "grad_norm": 0.8053455352783203, "learning_rate": 0.0002, "epoch": 4.960776989166978, "step": 6640}, {"loss": 1.5092, "grad_norm": 0.8680412173271179, "learning_rate": 0.0002, "epoch": 4.968248038849458, "step": 6650}, {"loss": 1.3978, "grad_norm": 0.7415758371353149, "learning_rate": 0.0002, "epoch": 4.975719088531939, "step": 6660}, {"loss": 1.3793, "grad_norm": 0.7730312347412109, "learning_rate": 0.0002, "epoch": 4.983190138214419, "step": 6670}, {"loss": 1.4863, "grad_norm": 0.7924041152000427, "learning_rate": 0.0002, "epoch": 4.990661187896899, "step": 6680}, {"loss": 1.4137, "grad_norm": 0.8677893877029419, "learning_rate": 0.0002, "epoch": 4.99813223757938, "step": 6690}]} +{"epoch": 6.0, "step": 8031, "epoch_duration": 1900.1275906562805, "total_accumulated_duration": 9420.934909820557, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}, {"eval_loss": 1.8046749830245972, "eval_runtime": 38.5096, "eval_samples_per_second": 13.373, "eval_steps_per_second": 1.688, "epoch": 2.0, "step": 2677}, {"loss": 1.7265, "grad_norm": 0.36817631125450134, "learning_rate": 0.0002, "epoch": 2.002241314904744, "step": 2680}, {"loss": 1.548, "grad_norm": 0.4056456685066223, "learning_rate": 0.0002, "epoch": 2.0097123645872244, "step": 2690}, {"loss": 1.5515, "grad_norm": 0.37416863441467285, "learning_rate": 0.0002, "epoch": 2.017183414269705, "step": 2700}, {"loss": 1.5895, "grad_norm": 0.4273638427257538, "learning_rate": 0.0002, "epoch": 2.024654463952185, "step": 2710}, {"loss": 1.5884, "grad_norm": 0.36497923731803894, "learning_rate": 0.0002, "epoch": 2.0321255136346656, "step": 2720}, {"loss": 1.6999, "grad_norm": 0.5021994113922119, "learning_rate": 0.0002, "epoch": 2.0395965633171462, "step": 2730}, {"loss": 1.6655, "grad_norm": 0.45896220207214355, "learning_rate": 0.0002, "epoch": 2.0470676129996264, "step": 2740}, {"loss": 1.6305, "grad_norm": 0.3973815143108368, "learning_rate": 0.0002, "epoch": 2.054538662682107, "step": 2750}, {"loss": 1.6301, "grad_norm": 0.4521815776824951, "learning_rate": 0.0002, "epoch": 2.062009712364587, "step": 2760}, {"loss": 1.6189, "grad_norm": 0.42775002121925354, "learning_rate": 0.0002, "epoch": 2.0694807620470677, "step": 2770}, {"loss": 1.6491, "grad_norm": 0.48158586025238037, "learning_rate": 0.0002, "epoch": 2.076951811729548, "step": 2780}, {"loss": 1.6301, "grad_norm": 0.4612371623516083, "learning_rate": 0.0002, "epoch": 2.0844228614120284, "step": 2790}, {"loss": 1.6327, "grad_norm": 0.42536866664886475, "learning_rate": 0.0002, "epoch": 2.091893911094509, "step": 2800}, {"loss": 1.651, "grad_norm": 0.48515772819519043, "learning_rate": 0.0002, "epoch": 2.099364960776989, "step": 2810}, {"loss": 1.6829, "grad_norm": 0.41418662667274475, "learning_rate": 0.0002, "epoch": 2.1068360104594697, "step": 2820}, {"loss": 1.6266, "grad_norm": 0.4683697819709778, "learning_rate": 0.0002, "epoch": 2.11430706014195, "step": 2830}, {"loss": 1.6586, "grad_norm": 0.4484657049179077, "learning_rate": 0.0002, "epoch": 2.1217781098244304, "step": 2840}, {"loss": 1.6483, "grad_norm": 0.6621400713920593, "learning_rate": 0.0002, "epoch": 2.1292491595069105, "step": 2850}, {"loss": 1.5755, "grad_norm": 0.45074811577796936, "learning_rate": 0.0002, "epoch": 2.136720209189391, "step": 2860}, {"loss": 1.6456, "grad_norm": 0.3513113558292389, "learning_rate": 0.0002, "epoch": 2.1441912588718717, "step": 2870}, {"loss": 1.6081, "grad_norm": 0.40411314368247986, "learning_rate": 0.0002, "epoch": 2.151662308554352, "step": 2880}, {"loss": 1.6323, "grad_norm": 0.4121065139770508, "learning_rate": 0.0002, "epoch": 2.1591333582368324, "step": 2890}, {"loss": 1.6324, "grad_norm": 0.44723689556121826, "learning_rate": 0.0002, "epoch": 2.1666044079193125, "step": 2900}, {"loss": 1.5699, "grad_norm": 0.4226122498512268, "learning_rate": 0.0002, "epoch": 2.174075457601793, "step": 2910}, {"loss": 1.5652, "grad_norm": 0.46617650985717773, "learning_rate": 0.0002, "epoch": 2.1815465072842732, "step": 2920}, {"loss": 1.6378, "grad_norm": 0.4506422281265259, "learning_rate": 0.0002, "epoch": 2.189017556966754, "step": 2930}, {"loss": 1.6112, "grad_norm": 0.4892672896385193, "learning_rate": 0.0002, "epoch": 2.1964886066492344, "step": 2940}, {"loss": 1.6176, "grad_norm": 0.44095516204833984, "learning_rate": 0.0002, "epoch": 2.2039596563317145, "step": 2950}, {"loss": 1.6058, "grad_norm": 0.41522109508514404, "learning_rate": 0.0002, "epoch": 2.211430706014195, "step": 2960}, {"loss": 1.5964, "grad_norm": 0.4860858917236328, "learning_rate": 0.0002, "epoch": 2.2189017556966752, "step": 2970}, {"loss": 1.6427, "grad_norm": 0.42662516236305237, "learning_rate": 0.0002, "epoch": 2.226372805379156, "step": 2980}, {"loss": 1.6313, "grad_norm": 0.4390648305416107, "learning_rate": 0.0002, "epoch": 2.233843855061636, "step": 2990}, {"loss": 1.5992, "grad_norm": 0.47515565156936646, "learning_rate": 0.0002, "epoch": 2.2413149047441165, "step": 3000}, {"loss": 1.5563, "grad_norm": 0.4104543924331665, "learning_rate": 0.0002, "epoch": 2.248785954426597, "step": 3010}, {"loss": 1.6895, "grad_norm": 0.4404028654098511, "learning_rate": 0.0002, "epoch": 2.2562570041090773, "step": 3020}, {"loss": 1.6088, "grad_norm": 0.4717366695404053, "learning_rate": 0.0002, "epoch": 2.263728053791558, "step": 3030}, {"loss": 1.7287, "grad_norm": 0.48345857858657837, "learning_rate": 0.0002, "epoch": 2.271199103474038, "step": 3040}, {"loss": 1.681, "grad_norm": 0.5312452912330627, "learning_rate": 0.0002, "epoch": 2.2786701531565186, "step": 3050}, {"loss": 1.5901, "grad_norm": 0.5073099732398987, "learning_rate": 0.0002, "epoch": 2.2861412028389987, "step": 3060}, {"loss": 1.6914, "grad_norm": 0.5027463436126709, "learning_rate": 0.0002, "epoch": 2.2936122525214793, "step": 3070}, {"loss": 1.5862, "grad_norm": 0.5436304807662964, "learning_rate": 0.0002, "epoch": 2.30108330220396, "step": 3080}, {"loss": 1.5763, "grad_norm": 0.4701065123081207, "learning_rate": 0.0002, "epoch": 2.30855435188644, "step": 3090}, {"loss": 1.6177, "grad_norm": 0.46988746523857117, "learning_rate": 0.0002, "epoch": 2.3160254015689206, "step": 3100}, {"loss": 1.6502, "grad_norm": 0.45112869143486023, "learning_rate": 0.0002, "epoch": 2.3234964512514007, "step": 3110}, {"loss": 1.6291, "grad_norm": 0.5173566937446594, "learning_rate": 0.0002, "epoch": 2.3309675009338813, "step": 3120}, {"loss": 1.6743, "grad_norm": 0.40345850586891174, "learning_rate": 0.0002, "epoch": 2.3384385506163614, "step": 3130}, {"loss": 1.621, "grad_norm": 0.4218924939632416, "learning_rate": 0.0002, "epoch": 2.345909600298842, "step": 3140}, {"loss": 1.6341, "grad_norm": 0.41857317090034485, "learning_rate": 0.0002, "epoch": 2.3533806499813226, "step": 3150}, {"loss": 1.6087, "grad_norm": 0.4197218418121338, "learning_rate": 0.0002, "epoch": 2.3608516996638027, "step": 3160}, {"loss": 1.6572, "grad_norm": 0.4260677397251129, "learning_rate": 0.0002, "epoch": 2.3683227493462833, "step": 3170}, {"loss": 1.6376, "grad_norm": 0.4209042191505432, "learning_rate": 0.0002, "epoch": 2.3757937990287634, "step": 3180}, {"loss": 1.634, "grad_norm": 0.4092234969139099, "learning_rate": 0.0002, "epoch": 2.383264848711244, "step": 3190}, {"loss": 1.6339, "grad_norm": 0.4928431510925293, "learning_rate": 0.0002, "epoch": 2.390735898393724, "step": 3200}, {"loss": 1.6015, "grad_norm": 0.49252402782440186, "learning_rate": 0.0002, "epoch": 2.3982069480762047, "step": 3210}, {"loss": 1.5773, "grad_norm": 0.4368397295475006, "learning_rate": 0.0002, "epoch": 2.4056779977586853, "step": 3220}, {"loss": 1.6629, "grad_norm": 0.46122390031814575, "learning_rate": 0.0002, "epoch": 2.4131490474411654, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4272301197052002, "learning_rate": 0.0002, "epoch": 2.420620097123646, "step": 3240}, {"loss": 1.5961, "grad_norm": 0.41480937600135803, "learning_rate": 0.0002, "epoch": 2.428091146806126, "step": 3250}, {"loss": 1.6281, "grad_norm": 0.48911941051483154, "learning_rate": 0.0002, "epoch": 2.4355621964886067, "step": 3260}, {"loss": 1.6846, "grad_norm": 0.4444098472595215, "learning_rate": 0.0002, "epoch": 2.443033246171087, "step": 3270}, {"loss": 1.6961, "grad_norm": 0.5111684799194336, "learning_rate": 0.0002, "epoch": 2.4505042958535674, "step": 3280}, {"loss": 1.6152, "grad_norm": 0.5058825016021729, "learning_rate": 0.0002, "epoch": 2.457975345536048, "step": 3290}, {"loss": 1.625, "grad_norm": 0.44173210859298706, "learning_rate": 0.0002, "epoch": 2.465446395218528, "step": 3300}, {"loss": 1.6491, "grad_norm": 0.4659745991230011, "learning_rate": 0.0002, "epoch": 2.4729174449010087, "step": 3310}, {"loss": 1.6114, "grad_norm": 0.47237497568130493, "learning_rate": 0.0002, "epoch": 2.480388494583489, "step": 3320}, {"loss": 1.6193, "grad_norm": 0.47303131222724915, "learning_rate": 0.0002, "epoch": 2.4878595442659694, "step": 3330}, {"loss": 1.7256, "grad_norm": 0.4522389769554138, "learning_rate": 0.0002, "epoch": 2.4953305939484496, "step": 3340}, {"loss": 1.6834, "grad_norm": 0.4467332363128662, "learning_rate": 0.0002, "epoch": 2.50280164363093, "step": 3350}, {"loss": 1.6108, "grad_norm": 0.4413762092590332, "learning_rate": 0.0002, "epoch": 2.5102726933134107, "step": 3360}, {"loss": 1.537, "grad_norm": 0.495514452457428, "learning_rate": 0.0002, "epoch": 2.517743742995891, "step": 3370}, {"loss": 1.5839, "grad_norm": 0.4429773986339569, "learning_rate": 0.0002, "epoch": 2.5252147926783715, "step": 3380}, {"loss": 1.6522, "grad_norm": 0.4589079022407532, "learning_rate": 0.0002, "epoch": 2.5326858423608516, "step": 3390}, {"loss": 1.6529, "grad_norm": 0.4683997333049774, "learning_rate": 0.0002, "epoch": 2.540156892043332, "step": 3400}, {"loss": 1.6745, "grad_norm": 0.4651731252670288, "learning_rate": 0.0002, "epoch": 2.5476279417258123, "step": 3410}, {"loss": 1.5918, "grad_norm": 0.45818084478378296, "learning_rate": 0.0002, "epoch": 2.555098991408293, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.45209529995918274, "learning_rate": 0.0002, "epoch": 2.5625700410907735, "step": 3430}, {"loss": 1.5606, "grad_norm": 0.4344733655452728, "learning_rate": 0.0002, "epoch": 2.5700410907732536, "step": 3440}, {"loss": 1.6748, "grad_norm": 0.47435566782951355, "learning_rate": 0.0002, "epoch": 2.577512140455734, "step": 3450}, {"loss": 1.6237, "grad_norm": 0.43841999769210815, "learning_rate": 0.0002, "epoch": 2.5849831901382143, "step": 3460}, {"loss": 1.7207, "grad_norm": 0.4323869049549103, "learning_rate": 0.0002, "epoch": 2.592454239820695, "step": 3470}, {"loss": 1.5494, "grad_norm": 0.44355881214141846, "learning_rate": 0.0002, "epoch": 2.599925289503175, "step": 3480}, {"loss": 1.665, "grad_norm": 0.45847779512405396, "learning_rate": 0.0002, "epoch": 2.6073963391856556, "step": 3490}, {"loss": 1.6006, "grad_norm": 0.4411061704158783, "learning_rate": 0.0002, "epoch": 2.614867388868136, "step": 3500}, {"loss": 1.5868, "grad_norm": 0.4446796178817749, "learning_rate": 0.0002, "epoch": 2.6223384385506163, "step": 3510}, {"loss": 1.5946, "grad_norm": 0.41969653964042664, "learning_rate": 0.0002, "epoch": 2.629809488233097, "step": 3520}, {"loss": 1.6798, "grad_norm": 0.5263747572898865, "learning_rate": 0.0002, "epoch": 2.637280537915577, "step": 3530}, {"loss": 1.6309, "grad_norm": 0.47719451785087585, "learning_rate": 0.0002, "epoch": 2.6447515875980576, "step": 3540}, {"loss": 1.7024, "grad_norm": 0.46574118733406067, "learning_rate": 0.0002, "epoch": 2.6522226372805378, "step": 3550}, {"loss": 1.618, "grad_norm": 0.46867135167121887, "learning_rate": 0.0002, "epoch": 2.6596936869630183, "step": 3560}, {"loss": 1.5885, "grad_norm": 0.4441198706626892, "learning_rate": 0.0002, "epoch": 2.667164736645499, "step": 3570}, {"loss": 1.6426, "grad_norm": 0.4871319830417633, "learning_rate": 0.0002, "epoch": 2.674635786327979, "step": 3580}, {"loss": 1.6575, "grad_norm": 0.43900373578071594, "learning_rate": 0.0002, "epoch": 2.6821068360104596, "step": 3590}, {"loss": 1.6071, "grad_norm": 0.42509549856185913, "learning_rate": 0.0002, "epoch": 2.6895778856929398, "step": 3600}, {"loss": 1.5651, "grad_norm": 0.4691086709499359, "learning_rate": 0.0002, "epoch": 2.6970489353754203, "step": 3610}, {"loss": 1.5491, "grad_norm": 0.46318942308425903, "learning_rate": 0.0002, "epoch": 2.7045199850579005, "step": 3620}, {"loss": 1.5422, "grad_norm": 0.44631096720695496, "learning_rate": 0.0002, "epoch": 2.711991034740381, "step": 3630}, {"loss": 1.6831, "grad_norm": 0.42315489053726196, "learning_rate": 0.0002, "epoch": 2.7194620844228616, "step": 3640}, {"loss": 1.6008, "grad_norm": 0.4971241056919098, "learning_rate": 0.0002, "epoch": 2.7269331341053418, "step": 3650}, {"loss": 1.6042, "grad_norm": 0.4578486382961273, "learning_rate": 0.0002, "epoch": 2.7344041837878224, "step": 3660}, {"loss": 1.6076, "grad_norm": 0.46584776043891907, "learning_rate": 0.0002, "epoch": 2.7418752334703025, "step": 3670}, {"loss": 1.6809, "grad_norm": 0.4951731264591217, "learning_rate": 0.0002, "epoch": 2.749346283152783, "step": 3680}, {"loss": 1.6226, "grad_norm": 0.4935225546360016, "learning_rate": 0.0002, "epoch": 2.756817332835263, "step": 3690}, {"loss": 1.5878, "grad_norm": 0.41805586218833923, "learning_rate": 0.0002, "epoch": 2.764288382517744, "step": 3700}, {"loss": 1.7173, "grad_norm": 0.4417555630207062, "learning_rate": 0.0002, "epoch": 2.7717594322002244, "step": 3710}, {"loss": 1.6398, "grad_norm": 0.48229655623435974, "learning_rate": 0.0002, "epoch": 2.7792304818827045, "step": 3720}, {"loss": 1.6074, "grad_norm": 0.48562315106391907, "learning_rate": 0.0002, "epoch": 2.786701531565185, "step": 3730}, {"loss": 1.607, "grad_norm": 0.4473940432071686, "learning_rate": 0.0002, "epoch": 2.794172581247665, "step": 3740}, {"loss": 1.6065, "grad_norm": 0.4626813232898712, "learning_rate": 0.0002, "epoch": 2.801643630930146, "step": 3750}, {"loss": 1.6296, "grad_norm": 0.4339792728424072, "learning_rate": 0.0002, "epoch": 2.809114680612626, "step": 3760}, {"loss": 1.6815, "grad_norm": 0.5250858068466187, "learning_rate": 0.0002, "epoch": 2.8165857302951065, "step": 3770}, {"loss": 1.6644, "grad_norm": 0.4537523090839386, "learning_rate": 0.0002, "epoch": 2.824056779977587, "step": 3780}, {"loss": 1.6535, "grad_norm": 0.5646113157272339, "learning_rate": 0.0002, "epoch": 2.831527829660067, "step": 3790}, {"loss": 1.5712, "grad_norm": 0.44243332743644714, "learning_rate": 0.0002, "epoch": 2.8389988793425474, "step": 3800}, {"loss": 1.6478, "grad_norm": 0.4585791826248169, "learning_rate": 0.0002, "epoch": 2.846469929025028, "step": 3810}, {"loss": 1.6854, "grad_norm": 0.489702045917511, "learning_rate": 0.0002, "epoch": 2.8539409787075085, "step": 3820}, {"loss": 1.7066, "grad_norm": 0.502470850944519, "learning_rate": 0.0002, "epoch": 2.8614120283899886, "step": 3830}, {"loss": 1.5785, "grad_norm": 0.4395960867404938, "learning_rate": 0.0002, "epoch": 2.8688830780724692, "step": 3840}, {"loss": 1.6434, "grad_norm": 0.4348670244216919, "learning_rate": 0.0002, "epoch": 2.87635412775495, "step": 3850}, {"loss": 1.6163, "grad_norm": 0.48852720856666565, "learning_rate": 0.0002, "epoch": 2.88382517743743, "step": 3860}, {"loss": 1.5916, "grad_norm": 0.45317450165748596, "learning_rate": 0.0002, "epoch": 2.89129622711991, "step": 3870}, {"loss": 1.6486, "grad_norm": 0.4732758700847626, "learning_rate": 0.0002, "epoch": 2.8987672768023907, "step": 3880}, {"loss": 1.6758, "grad_norm": 0.45238012075424194, "learning_rate": 0.0002, "epoch": 2.9062383264848712, "step": 3890}, {"loss": 1.6228, "grad_norm": 0.48838064074516296, "learning_rate": 0.0002, "epoch": 2.9137093761673514, "step": 3900}, {"loss": 1.658, "grad_norm": 0.43496349453926086, "learning_rate": 0.0002, "epoch": 2.921180425849832, "step": 3910}, {"loss": 1.7063, "grad_norm": 0.47963935136795044, "learning_rate": 0.0002, "epoch": 2.9286514755323125, "step": 3920}, {"loss": 1.6553, "grad_norm": 0.4544987976551056, "learning_rate": 0.0002, "epoch": 2.9361225252147927, "step": 3930}, {"loss": 1.6192, "grad_norm": 0.4622892141342163, "learning_rate": 0.0002, "epoch": 2.943593574897273, "step": 3940}, {"loss": 1.6178, "grad_norm": 0.47026222944259644, "learning_rate": 0.0002, "epoch": 2.9510646245797534, "step": 3950}, {"loss": 1.6612, "grad_norm": 0.4549552798271179, "learning_rate": 0.0002, "epoch": 2.958535674262234, "step": 3960}, {"loss": 1.6458, "grad_norm": 0.46647515892982483, "learning_rate": 0.0002, "epoch": 2.966006723944714, "step": 3970}, {"loss": 1.6051, "grad_norm": 0.45095112919807434, "learning_rate": 0.0002, "epoch": 2.9734777736271947, "step": 3980}, {"loss": 1.6471, "grad_norm": 0.4690017104148865, "learning_rate": 0.0002, "epoch": 2.9809488233096753, "step": 3990}, {"loss": 1.6061, "grad_norm": 0.4603444039821625, "learning_rate": 0.0002, "epoch": 2.9884198729921554, "step": 4000}, {"loss": 1.6431, "grad_norm": 0.4743294417858124, "learning_rate": 0.0002, "epoch": 2.9958909226746355, "step": 4010}, {"eval_loss": 1.8252571821212769, "eval_runtime": 38.7853, "eval_samples_per_second": 13.278, "eval_steps_per_second": 1.676, "epoch": 2.999626447515876, "step": 4015}, {"loss": 1.6512, "grad_norm": 0.4919724464416504, "learning_rate": 0.0002, "epoch": 3.003361972357116, "step": 4020}, {"loss": 1.5354, "grad_norm": 0.4747185707092285, "learning_rate": 0.0002, "epoch": 3.0108330220395967, "step": 4030}, {"loss": 1.568, "grad_norm": 0.4797595143318176, "learning_rate": 0.0002, "epoch": 3.018304071722077, "step": 4040}, {"loss": 1.5194, "grad_norm": 0.5450999140739441, "learning_rate": 0.0002, "epoch": 3.0257751214045574, "step": 4050}, {"loss": 1.5065, "grad_norm": 0.49058812856674194, "learning_rate": 0.0002, "epoch": 3.0332461710870375, "step": 4060}, {"loss": 1.4884, "grad_norm": 0.5219563841819763, "learning_rate": 0.0002, "epoch": 3.040717220769518, "step": 4070}, {"loss": 1.4742, "grad_norm": 0.515628457069397, "learning_rate": 0.0002, "epoch": 3.0481882704519987, "step": 4080}, {"loss": 1.5313, "grad_norm": 0.6145984530448914, "learning_rate": 0.0002, "epoch": 3.055659320134479, "step": 4090}, {"loss": 1.4989, "grad_norm": 0.6067144274711609, "learning_rate": 0.0002, "epoch": 3.0631303698169594, "step": 4100}, {"loss": 1.528, "grad_norm": 0.5773133039474487, "learning_rate": 0.0002, "epoch": 3.0706014194994395, "step": 4110}, {"loss": 1.5374, "grad_norm": 0.6894241571426392, "learning_rate": 0.0002, "epoch": 3.07807246918192, "step": 4120}, {"loss": 1.5422, "grad_norm": 0.6422514915466309, "learning_rate": 0.0002, "epoch": 3.0855435188644003, "step": 4130}, {"loss": 1.4724, "grad_norm": 0.6119855046272278, "learning_rate": 0.0002, "epoch": 3.093014568546881, "step": 4140}, {"loss": 1.5361, "grad_norm": 0.5847280025482178, "learning_rate": 0.0002, "epoch": 3.1004856182293614, "step": 4150}, {"loss": 1.5151, "grad_norm": 0.5401515960693359, "learning_rate": 0.0002, "epoch": 3.1079566679118416, "step": 4160}, {"loss": 1.502, "grad_norm": 0.6501587629318237, "learning_rate": 0.0002, "epoch": 3.115427717594322, "step": 4170}, {"loss": 1.4952, "grad_norm": 0.5988039374351501, "learning_rate": 0.0002, "epoch": 3.1228987672768023, "step": 4180}, {"loss": 1.5287, "grad_norm": 0.4982665181159973, "learning_rate": 0.0002, "epoch": 3.130369816959283, "step": 4190}, {"loss": 1.5078, "grad_norm": 0.5548039078712463, "learning_rate": 0.0002, "epoch": 3.137840866641763, "step": 4200}, {"loss": 1.4904, "grad_norm": 0.5920777320861816, "learning_rate": 0.0002, "epoch": 3.1453119163242436, "step": 4210}, {"loss": 1.442, "grad_norm": 0.6965190172195435, "learning_rate": 0.0002, "epoch": 3.152782966006724, "step": 4220}, {"loss": 1.557, "grad_norm": 0.5196244716644287, "learning_rate": 0.0002, "epoch": 3.1602540156892043, "step": 4230}, {"loss": 1.5706, "grad_norm": 0.6942682266235352, "learning_rate": 0.0002, "epoch": 3.167725065371685, "step": 4240}, {"loss": 1.5407, "grad_norm": 0.5765156149864197, "learning_rate": 0.0002, "epoch": 3.175196115054165, "step": 4250}, {"loss": 1.4963, "grad_norm": 0.5801976919174194, "learning_rate": 0.0002, "epoch": 3.1826671647366456, "step": 4260}, {"loss": 1.4988, "grad_norm": 0.6260752081871033, "learning_rate": 0.0002, "epoch": 3.1901382144191257, "step": 4270}, {"loss": 1.5074, "grad_norm": 0.6610770225524902, "learning_rate": 0.0002, "epoch": 3.1976092641016063, "step": 4280}, {"loss": 1.4657, "grad_norm": 0.5762143135070801, "learning_rate": 0.0002, "epoch": 3.205080313784087, "step": 4290}, {"loss": 1.5181, "grad_norm": 0.5926990509033203, "learning_rate": 0.0002, "epoch": 3.212551363466567, "step": 4300}, {"loss": 1.5492, "grad_norm": 0.7373854517936707, "learning_rate": 0.0002, "epoch": 3.2200224131490476, "step": 4310}, {"loss": 1.4648, "grad_norm": 0.5963311195373535, "learning_rate": 0.0002, "epoch": 3.2274934628315277, "step": 4320}, {"loss": 1.5262, "grad_norm": 0.5754616856575012, "learning_rate": 0.0002, "epoch": 3.2349645125140083, "step": 4330}, {"loss": 1.4767, "grad_norm": 0.6116095781326294, "learning_rate": 0.0002, "epoch": 3.2424355621964884, "step": 4340}, {"loss": 1.5008, "grad_norm": 0.6001536846160889, "learning_rate": 0.0002, "epoch": 3.249906611878969, "step": 4350}, {"loss": 1.5738, "grad_norm": 0.5270227789878845, "learning_rate": 0.0002, "epoch": 3.257377661561449, "step": 4360}, {"loss": 1.5235, "grad_norm": 0.6666602492332458, "learning_rate": 0.0002, "epoch": 3.2648487112439297, "step": 4370}, {"loss": 1.5665, "grad_norm": 0.520310640335083, "learning_rate": 0.0002, "epoch": 3.2723197609264103, "step": 4380}, {"loss": 1.542, "grad_norm": 0.5165975093841553, "learning_rate": 0.0002, "epoch": 3.2797908106088904, "step": 4390}, {"loss": 1.4746, "grad_norm": 0.6080228686332703, "learning_rate": 0.0002, "epoch": 3.287261860291371, "step": 4400}, {"loss": 1.4901, "grad_norm": 0.670122504234314, "learning_rate": 0.0002, "epoch": 3.294732909973851, "step": 4410}, {"loss": 1.4677, "grad_norm": 0.6019457578659058, "learning_rate": 0.0002, "epoch": 3.3022039596563317, "step": 4420}, {"loss": 1.4249, "grad_norm": 0.5519300103187561, "learning_rate": 0.0002, "epoch": 3.309675009338812, "step": 4430}, {"loss": 1.555, "grad_norm": 0.5958521962165833, "learning_rate": 0.0002, "epoch": 3.3171460590212924, "step": 4440}, {"loss": 1.5067, "grad_norm": 0.5552705526351929, "learning_rate": 0.0002, "epoch": 3.324617108703773, "step": 4450}, {"loss": 1.5926, "grad_norm": 0.6583784818649292, "learning_rate": 0.0002, "epoch": 3.332088158386253, "step": 4460}, {"loss": 1.4206, "grad_norm": 0.5815939903259277, "learning_rate": 0.0002, "epoch": 3.3395592080687337, "step": 4470}, {"loss": 1.5942, "grad_norm": 1.3342205286026, "learning_rate": 0.0002, "epoch": 3.347030257751214, "step": 4480}, {"loss": 1.484, "grad_norm": 0.6341500878334045, "learning_rate": 0.0002, "epoch": 3.3545013074336945, "step": 4490}, {"loss": 1.5219, "grad_norm": 0.6384079456329346, "learning_rate": 0.0002, "epoch": 3.3619723571161746, "step": 4500}, {"loss": 1.5222, "grad_norm": 0.6098346710205078, "learning_rate": 0.0002, "epoch": 3.369443406798655, "step": 4510}, {"loss": 1.5475, "grad_norm": 0.5958296656608582, "learning_rate": 0.0002, "epoch": 3.3769144564811358, "step": 4520}, {"loss": 1.5171, "grad_norm": 0.6157881617546082, "learning_rate": 0.0002, "epoch": 3.384385506163616, "step": 4530}, {"loss": 1.569, "grad_norm": 0.5671007037162781, "learning_rate": 0.0002, "epoch": 3.3918565558460965, "step": 4540}, {"loss": 1.604, "grad_norm": 0.6203294992446899, "learning_rate": 0.0002, "epoch": 3.3993276055285766, "step": 4550}, {"loss": 1.5364, "grad_norm": 0.6743317246437073, "learning_rate": 0.0002, "epoch": 3.406798655211057, "step": 4560}, {"loss": 1.5034, "grad_norm": 0.731765627861023, "learning_rate": 0.0002, "epoch": 3.4142697048935373, "step": 4570}, {"loss": 1.4585, "grad_norm": 0.6285187602043152, "learning_rate": 0.0002, "epoch": 3.421740754576018, "step": 4580}, {"loss": 1.5296, "grad_norm": 0.612680196762085, "learning_rate": 0.0002, "epoch": 3.4292118042584985, "step": 4590}, {"loss": 1.5577, "grad_norm": 0.6413681507110596, "learning_rate": 0.0002, "epoch": 3.4366828539409786, "step": 4600}, {"loss": 1.5026, "grad_norm": 0.6240990161895752, "learning_rate": 0.0002, "epoch": 3.444153903623459, "step": 4610}, {"loss": 1.5887, "grad_norm": 0.5095735192298889, "learning_rate": 0.0002, "epoch": 3.4516249533059393, "step": 4620}, {"loss": 1.4906, "grad_norm": 0.5699611902236938, "learning_rate": 0.0002, "epoch": 3.45909600298842, "step": 4630}, {"loss": 1.5176, "grad_norm": 0.7289775609970093, "learning_rate": 0.0002, "epoch": 3.4665670526709, "step": 4640}, {"loss": 1.5467, "grad_norm": 0.6211609840393066, "learning_rate": 0.0002, "epoch": 3.4740381023533806, "step": 4650}, {"loss": 1.533, "grad_norm": 0.5714802145957947, "learning_rate": 0.0002, "epoch": 3.481509152035861, "step": 4660}, {"loss": 1.5096, "grad_norm": 0.6287049651145935, "learning_rate": 0.0002, "epoch": 3.4889802017183413, "step": 4670}, {"loss": 1.4212, "grad_norm": 0.5480595827102661, "learning_rate": 0.0002, "epoch": 3.496451251400822, "step": 4680}, {"loss": 1.4746, "grad_norm": 0.5683253407478333, "learning_rate": 0.0002, "epoch": 3.503922301083302, "step": 4690}, {"loss": 1.5012, "grad_norm": 0.601140558719635, "learning_rate": 0.0002, "epoch": 3.5113933507657826, "step": 4700}, {"loss": 1.5383, "grad_norm": 0.5344498157501221, "learning_rate": 0.0002, "epoch": 3.5188644004482628, "step": 4710}, {"loss": 1.5428, "grad_norm": 0.5739690661430359, "learning_rate": 0.0002, "epoch": 3.5263354501307433, "step": 4720}, {"loss": 1.5589, "grad_norm": 0.5640085935592651, "learning_rate": 0.0002, "epoch": 3.533806499813224, "step": 4730}, {"loss": 1.487, "grad_norm": 0.5967805981636047, "learning_rate": 0.0002, "epoch": 3.541277549495704, "step": 4740}, {"loss": 1.5461, "grad_norm": 0.6138835549354553, "learning_rate": 0.0002, "epoch": 3.5487485991781846, "step": 4750}, {"loss": 1.5502, "grad_norm": 0.6779900193214417, "learning_rate": 0.0002, "epoch": 3.5562196488606648, "step": 4760}, {"loss": 1.4917, "grad_norm": 0.6122010350227356, "learning_rate": 0.0002, "epoch": 3.5636906985431454, "step": 4770}, {"loss": 1.5405, "grad_norm": 0.5685241222381592, "learning_rate": 0.0002, "epoch": 3.5711617482256255, "step": 4780}, {"loss": 1.5427, "grad_norm": 0.604583203792572, "learning_rate": 0.0002, "epoch": 3.578632797908106, "step": 4790}, {"loss": 1.4514, "grad_norm": 0.651165246963501, "learning_rate": 0.0002, "epoch": 3.5861038475905866, "step": 4800}, {"loss": 1.4109, "grad_norm": 0.6398511528968811, "learning_rate": 0.0002, "epoch": 3.593574897273067, "step": 4810}, {"loss": 1.4261, "grad_norm": 0.6444641351699829, "learning_rate": 0.0002, "epoch": 3.6010459469555474, "step": 4820}, {"loss": 1.5274, "grad_norm": 0.6018481850624084, "learning_rate": 0.0002, "epoch": 3.6085169966380275, "step": 4830}, {"loss": 1.4647, "grad_norm": 0.6025291085243225, "learning_rate": 0.0002, "epoch": 3.615988046320508, "step": 4840}, {"loss": 1.5609, "grad_norm": 0.6810156106948853, "learning_rate": 0.0002, "epoch": 3.623459096002988, "step": 4850}, {"loss": 1.5299, "grad_norm": 0.6408044695854187, "learning_rate": 0.0002, "epoch": 3.630930145685469, "step": 4860}, {"loss": 1.5366, "grad_norm": 0.5608272552490234, "learning_rate": 0.0002, "epoch": 3.6384011953679494, "step": 4870}, {"loss": 1.5188, "grad_norm": 0.6136814951896667, "learning_rate": 0.0002, "epoch": 3.6458722450504295, "step": 4880}, {"loss": 1.5021, "grad_norm": 0.5927900075912476, "learning_rate": 0.0002, "epoch": 3.65334329473291, "step": 4890}, {"loss": 1.6084, "grad_norm": 0.5336901545524597, "learning_rate": 0.0002, "epoch": 3.66081434441539, "step": 4900}, {"loss": 1.5701, "grad_norm": 0.7823320627212524, "learning_rate": 0.0002, "epoch": 3.668285394097871, "step": 4910}, {"loss": 1.4881, "grad_norm": 0.6703504323959351, "learning_rate": 0.0002, "epoch": 3.675756443780351, "step": 4920}, {"loss": 1.5332, "grad_norm": 0.6061160564422607, "learning_rate": 0.0002, "epoch": 3.6832274934628315, "step": 4930}, {"loss": 1.5405, "grad_norm": 0.6237227916717529, "learning_rate": 0.0002, "epoch": 3.690698543145312, "step": 4940}, {"loss": 1.497, "grad_norm": 0.5985278487205505, "learning_rate": 0.0002, "epoch": 3.6981695928277922, "step": 4950}, {"loss": 1.5132, "grad_norm": 0.6483839750289917, "learning_rate": 0.0002, "epoch": 3.705640642510273, "step": 4960}, {"loss": 1.5338, "grad_norm": 0.5788805484771729, "learning_rate": 0.0002, "epoch": 3.713111692192753, "step": 4970}, {"loss": 1.5258, "grad_norm": 0.5609974265098572, "learning_rate": 0.0002, "epoch": 3.7205827418752335, "step": 4980}, {"loss": 1.4759, "grad_norm": 0.5681300759315491, "learning_rate": 0.0002, "epoch": 3.7280537915577137, "step": 4990}, {"loss": 1.6018, "grad_norm": 0.5860186219215393, "learning_rate": 0.0002, "epoch": 3.7355248412401942, "step": 5000}, {"loss": 1.58, "grad_norm": 0.5718157291412354, "learning_rate": 0.0002, "epoch": 3.742995890922675, "step": 5010}, {"loss": 1.5834, "grad_norm": 0.6173721551895142, "learning_rate": 0.0002, "epoch": 3.750466940605155, "step": 5020}, {"loss": 1.5617, "grad_norm": 0.629152238368988, "learning_rate": 0.0002, "epoch": 3.7579379902876355, "step": 5030}, {"loss": 1.519, "grad_norm": 0.5666284561157227, "learning_rate": 0.0002, "epoch": 3.7654090399701157, "step": 5040}, {"loss": 1.5329, "grad_norm": 0.6053005456924438, "learning_rate": 0.0002, "epoch": 3.7728800896525962, "step": 5050}, {"loss": 1.5404, "grad_norm": 0.5870583057403564, "learning_rate": 0.0002, "epoch": 3.7803511393350764, "step": 5060}, {"loss": 1.4444, "grad_norm": 0.5422009229660034, "learning_rate": 0.0002, "epoch": 3.787822189017557, "step": 5070}, {"loss": 1.5308, "grad_norm": 0.5396918058395386, "learning_rate": 0.0002, "epoch": 3.7952932387000375, "step": 5080}, {"loss": 1.464, "grad_norm": 0.5544713139533997, "learning_rate": 0.0002, "epoch": 3.8027642883825177, "step": 5090}, {"loss": 1.4752, "grad_norm": 0.5983749628067017, "learning_rate": 0.0002, "epoch": 3.8102353380649983, "step": 5100}, {"loss": 1.4972, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 3.8177063877474784, "step": 5110}, {"loss": 1.5471, "grad_norm": 0.5436882376670837, "learning_rate": 0.0002, "epoch": 3.825177437429959, "step": 5120}, {"loss": 1.5118, "grad_norm": 0.5453617572784424, "learning_rate": 0.0002, "epoch": 3.832648487112439, "step": 5130}, {"loss": 1.5732, "grad_norm": 0.6269069314002991, "learning_rate": 0.0002, "epoch": 3.8401195367949197, "step": 5140}, {"loss": 1.4959, "grad_norm": 0.6189185380935669, "learning_rate": 0.0002, "epoch": 3.8475905864774003, "step": 5150}, {"loss": 1.4999, "grad_norm": 0.6653388142585754, "learning_rate": 0.0002, "epoch": 3.8550616361598804, "step": 5160}, {"loss": 1.5075, "grad_norm": 0.5771768689155579, "learning_rate": 0.0002, "epoch": 3.862532685842361, "step": 5170}, {"loss": 1.5545, "grad_norm": 0.6052790880203247, "learning_rate": 0.0002, "epoch": 3.870003735524841, "step": 5180}, {"loss": 1.4987, "grad_norm": 0.6572316884994507, "learning_rate": 0.0002, "epoch": 3.8774747852073217, "step": 5190}, {"loss": 1.5241, "grad_norm": 0.670576810836792, "learning_rate": 0.0002, "epoch": 3.884945834889802, "step": 5200}, {"loss": 1.4777, "grad_norm": 0.5728798508644104, "learning_rate": 0.0002, "epoch": 3.8924168845722824, "step": 5210}, {"loss": 1.5351, "grad_norm": 0.6340774297714233, "learning_rate": 0.0002, "epoch": 3.899887934254763, "step": 5220}, {"loss": 1.5081, "grad_norm": 0.5981315970420837, "learning_rate": 0.0002, "epoch": 3.907358983937243, "step": 5230}, {"loss": 1.4875, "grad_norm": 0.6212025880813599, "learning_rate": 0.0002, "epoch": 3.9148300336197237, "step": 5240}, {"loss": 1.5545, "grad_norm": 0.6202296018600464, "learning_rate": 0.0002, "epoch": 3.922301083302204, "step": 5250}, {"loss": 1.5765, "grad_norm": 0.6159142255783081, "learning_rate": 0.0002, "epoch": 3.9297721329846844, "step": 5260}, {"loss": 1.4938, "grad_norm": 0.6519438624382019, "learning_rate": 0.0002, "epoch": 3.9372431826671646, "step": 5270}, {"loss": 1.4859, "grad_norm": 0.539813756942749, "learning_rate": 0.0002, "epoch": 3.944714232349645, "step": 5280}, {"loss": 1.5921, "grad_norm": 0.6443665027618408, "learning_rate": 0.0002, "epoch": 3.9521852820321257, "step": 5290}, {"loss": 1.5153, "grad_norm": 0.6635757684707642, "learning_rate": 0.0002, "epoch": 3.959656331714606, "step": 5300}, {"loss": 1.5485, "grad_norm": 0.589363157749176, "learning_rate": 0.0002, "epoch": 3.9671273813970864, "step": 5310}, {"loss": 1.5498, "grad_norm": 0.5788735747337341, "learning_rate": 0.0002, "epoch": 3.9745984310795666, "step": 5320}, {"loss": 1.5607, "grad_norm": 0.5976864695549011, "learning_rate": 0.0002, "epoch": 3.982069480762047, "step": 5330}, {"loss": 1.5302, "grad_norm": 0.6624067425727844, "learning_rate": 0.0002, "epoch": 3.9895405304445273, "step": 5340}, {"loss": 1.5904, "grad_norm": 0.6738956570625305, "learning_rate": 0.0002, "epoch": 3.997011580127008, "step": 5350}, {"eval_loss": 1.868006944656372, "eval_runtime": 38.5153, "eval_samples_per_second": 13.371, "eval_steps_per_second": 1.688, "epoch": 4.0, "step": 5354}, {"loss": 1.4535, "grad_norm": 0.6023468971252441, "learning_rate": 0.0002, "epoch": 4.004482629809488, "step": 5360}, {"loss": 1.3987, "grad_norm": 0.8589285612106323, "learning_rate": 0.0002, "epoch": 4.011953679491969, "step": 5370}, {"loss": 1.3952, "grad_norm": 0.7477491497993469, "learning_rate": 0.0002, "epoch": 4.019424729174449, "step": 5380}, {"loss": 1.3745, "grad_norm": 0.7601922154426575, "learning_rate": 0.0002, "epoch": 4.02689577885693, "step": 5390}, {"loss": 1.4133, "grad_norm": 0.8115614056587219, "learning_rate": 0.0002, "epoch": 4.03436682853941, "step": 5400}, {"loss": 1.3748, "grad_norm": 0.669925332069397, "learning_rate": 0.0002, "epoch": 4.04183787822189, "step": 5410}, {"loss": 1.2835, "grad_norm": 0.8091904520988464, "learning_rate": 0.0002, "epoch": 4.04930892790437, "step": 5420}, {"loss": 1.3615, "grad_norm": 0.709405779838562, "learning_rate": 0.0002, "epoch": 4.056779977586851, "step": 5430}, {"loss": 1.3558, "grad_norm": 1.0006179809570312, "learning_rate": 0.0002, "epoch": 4.064251027269331, "step": 5440}, {"loss": 1.3491, "grad_norm": 0.7017965912818909, "learning_rate": 0.0002, "epoch": 4.071722076951811, "step": 5450}, {"loss": 1.3642, "grad_norm": 0.8991572260856628, "learning_rate": 0.0002, "epoch": 4.0791931266342925, "step": 5460}, {"loss": 1.392, "grad_norm": 0.9064797759056091, "learning_rate": 0.0002, "epoch": 4.086664176316773, "step": 5470}, {"loss": 1.3425, "grad_norm": 0.7981749176979065, "learning_rate": 0.0002, "epoch": 4.094135225999253, "step": 5480}, {"loss": 1.3826, "grad_norm": 0.7280883193016052, "learning_rate": 0.0002, "epoch": 4.101606275681733, "step": 5490}, {"loss": 1.3275, "grad_norm": 0.7419600486755371, "learning_rate": 0.0002, "epoch": 4.109077325364214, "step": 5500}, {"loss": 1.3199, "grad_norm": 0.8019949197769165, "learning_rate": 0.0002, "epoch": 4.116548375046694, "step": 5510}, {"loss": 1.3133, "grad_norm": 0.7501229047775269, "learning_rate": 0.0002, "epoch": 4.124019424729174, "step": 5520}, {"loss": 1.4432, "grad_norm": 0.8166249990463257, "learning_rate": 0.0002, "epoch": 4.131490474411655, "step": 5530}, {"loss": 1.3901, "grad_norm": 0.9728496074676514, "learning_rate": 0.0002, "epoch": 4.138961524094135, "step": 5540}, {"loss": 1.3538, "grad_norm": 0.7590922117233276, "learning_rate": 0.0002, "epoch": 4.1464325737766154, "step": 5550}, {"loss": 1.4368, "grad_norm": 0.7759010791778564, "learning_rate": 0.0002, "epoch": 4.153903623459096, "step": 5560}, {"loss": 1.3635, "grad_norm": 0.9057986736297607, "learning_rate": 0.0002, "epoch": 4.161374673141577, "step": 5570}, {"loss": 1.4152, "grad_norm": 0.8853937983512878, "learning_rate": 0.0002, "epoch": 4.168845722824057, "step": 5580}, {"loss": 1.3633, "grad_norm": 0.7070684432983398, "learning_rate": 0.0002, "epoch": 4.176316772506537, "step": 5590}, {"loss": 1.3218, "grad_norm": 0.7649410963058472, "learning_rate": 0.0002, "epoch": 4.183787822189018, "step": 5600}, {"loss": 1.3857, "grad_norm": 1.2048029899597168, "learning_rate": 0.0002, "epoch": 4.191258871871498, "step": 5610}, {"loss": 1.3629, "grad_norm": 0.7986605763435364, "learning_rate": 0.0002, "epoch": 4.198729921553978, "step": 5620}, {"loss": 1.3995, "grad_norm": 0.8151885867118835, "learning_rate": 0.0002, "epoch": 4.206200971236458, "step": 5630}, {"loss": 1.3782, "grad_norm": 0.7719064354896545, "learning_rate": 0.0002, "epoch": 4.213672020918939, "step": 5640}, {"loss": 1.3852, "grad_norm": 0.8422448039054871, "learning_rate": 0.0002, "epoch": 4.2211430706014195, "step": 5650}, {"loss": 1.3321, "grad_norm": 0.7017164826393127, "learning_rate": 0.0002, "epoch": 4.2286141202839, "step": 5660}, {"loss": 1.4105, "grad_norm": 0.8559677600860596, "learning_rate": 0.0002, "epoch": 4.236085169966381, "step": 5670}, {"loss": 1.3701, "grad_norm": 0.8216157555580139, "learning_rate": 0.0002, "epoch": 4.243556219648861, "step": 5680}, {"loss": 1.3565, "grad_norm": 0.7681755423545837, "learning_rate": 0.0002, "epoch": 4.251027269331341, "step": 5690}, {"loss": 1.3806, "grad_norm": 0.811665952205658, "learning_rate": 0.0002, "epoch": 4.258498319013821, "step": 5700}, {"loss": 1.4161, "grad_norm": 0.7242204546928406, "learning_rate": 0.0002, "epoch": 4.265969368696302, "step": 5710}, {"loss": 1.2958, "grad_norm": 0.7570181488990784, "learning_rate": 0.0002, "epoch": 4.273440418378782, "step": 5720}, {"loss": 1.4265, "grad_norm": 0.8951969146728516, "learning_rate": 0.0002, "epoch": 4.280911468061262, "step": 5730}, {"loss": 1.3895, "grad_norm": 0.7222902178764343, "learning_rate": 0.0002, "epoch": 4.288382517743743, "step": 5740}, {"loss": 1.4155, "grad_norm": 0.8508469462394714, "learning_rate": 0.0002, "epoch": 4.2958535674262235, "step": 5750}, {"loss": 1.365, "grad_norm": 0.7215430736541748, "learning_rate": 0.0002, "epoch": 4.303324617108704, "step": 5760}, {"loss": 1.4472, "grad_norm": 0.8774884939193726, "learning_rate": 0.0002, "epoch": 4.310795666791184, "step": 5770}, {"loss": 1.427, "grad_norm": 0.8354552984237671, "learning_rate": 0.0002, "epoch": 4.318266716473665, "step": 5780}, {"loss": 1.3222, "grad_norm": 0.6938814520835876, "learning_rate": 0.0002, "epoch": 4.325737766156145, "step": 5790}, {"loss": 1.3589, "grad_norm": 0.78675377368927, "learning_rate": 0.0002, "epoch": 4.333208815838625, "step": 5800}, {"loss": 1.3662, "grad_norm": 0.7147697806358337, "learning_rate": 0.0002, "epoch": 4.340679865521106, "step": 5810}, {"loss": 1.3597, "grad_norm": 0.7693623304367065, "learning_rate": 0.0002, "epoch": 4.348150915203586, "step": 5820}, {"loss": 1.2944, "grad_norm": 0.856517493724823, "learning_rate": 0.0002, "epoch": 4.355621964886066, "step": 5830}, {"loss": 1.4307, "grad_norm": 0.7200973033905029, "learning_rate": 0.0002, "epoch": 4.3630930145685465, "step": 5840}, {"loss": 1.442, "grad_norm": 0.743281364440918, "learning_rate": 0.0002, "epoch": 4.3705640642510275, "step": 5850}, {"loss": 1.3999, "grad_norm": 0.7627727389335632, "learning_rate": 0.0002, "epoch": 4.378035113933508, "step": 5860}, {"loss": 1.4082, "grad_norm": 0.7238836884498596, "learning_rate": 0.0002, "epoch": 4.385506163615988, "step": 5870}, {"loss": 1.4292, "grad_norm": 0.7253410816192627, "learning_rate": 0.0002, "epoch": 4.392977213298469, "step": 5880}, {"loss": 1.3774, "grad_norm": 0.8232238292694092, "learning_rate": 0.0002, "epoch": 4.400448262980949, "step": 5890}, {"loss": 1.3757, "grad_norm": 0.8778504729270935, "learning_rate": 0.0002, "epoch": 4.407919312663429, "step": 5900}, {"loss": 1.387, "grad_norm": 0.7639474868774414, "learning_rate": 0.0002, "epoch": 4.415390362345909, "step": 5910}, {"loss": 1.3862, "grad_norm": 0.7666519284248352, "learning_rate": 0.0002, "epoch": 4.42286141202839, "step": 5920}, {"loss": 1.4168, "grad_norm": 0.867132842540741, "learning_rate": 0.0002, "epoch": 4.43033246171087, "step": 5930}, {"loss": 1.4772, "grad_norm": 0.7571166753768921, "learning_rate": 0.0002, "epoch": 4.4378035113933505, "step": 5940}, {"loss": 1.4401, "grad_norm": 0.7911370992660522, "learning_rate": 0.0002, "epoch": 4.4452745610758315, "step": 5950}, {"loss": 1.4516, "grad_norm": 0.8844250440597534, "learning_rate": 0.0002, "epoch": 4.452745610758312, "step": 5960}, {"loss": 1.4109, "grad_norm": 0.7336231470108032, "learning_rate": 0.0002, "epoch": 4.460216660440792, "step": 5970}, {"loss": 1.3891, "grad_norm": 0.8162738084793091, "learning_rate": 0.0002, "epoch": 4.467687710123272, "step": 5980}, {"loss": 1.393, "grad_norm": 0.7413017153739929, "learning_rate": 0.0002, "epoch": 4.475158759805753, "step": 5990}, {"loss": 1.3712, "grad_norm": 0.7215432524681091, "learning_rate": 0.0002, "epoch": 4.482629809488233, "step": 6000}, {"loss": 1.3521, "grad_norm": 0.8943389058113098, "learning_rate": 0.0002, "epoch": 4.490100859170713, "step": 6010}, {"loss": 1.4172, "grad_norm": 0.7850823998451233, "learning_rate": 0.0002, "epoch": 4.497571908853194, "step": 6020}, {"loss": 1.3582, "grad_norm": 0.8117504119873047, "learning_rate": 0.0002, "epoch": 4.505042958535674, "step": 6030}, {"loss": 1.4272, "grad_norm": 0.8381605744361877, "learning_rate": 0.0002, "epoch": 4.5125140082181545, "step": 6040}, {"loss": 1.3829, "grad_norm": 0.7964059710502625, "learning_rate": 0.0002, "epoch": 4.519985057900635, "step": 6050}, {"loss": 1.3555, "grad_norm": 0.7935128211975098, "learning_rate": 0.0002, "epoch": 4.527456107583116, "step": 6060}, {"loss": 1.3994, "grad_norm": 0.8725124597549438, "learning_rate": 0.0002, "epoch": 4.534927157265596, "step": 6070}, {"loss": 1.3923, "grad_norm": 0.880325198173523, "learning_rate": 0.0002, "epoch": 4.542398206948076, "step": 6080}, {"loss": 1.4459, "grad_norm": 0.7220637202262878, "learning_rate": 0.0002, "epoch": 4.549869256630557, "step": 6090}, {"loss": 1.3281, "grad_norm": 0.6908547878265381, "learning_rate": 0.0002, "epoch": 4.557340306313037, "step": 6100}, {"loss": 1.437, "grad_norm": 0.797931969165802, "learning_rate": 0.0002, "epoch": 4.564811355995517, "step": 6110}, {"loss": 1.4023, "grad_norm": 0.7056134343147278, "learning_rate": 0.0002, "epoch": 4.572282405677997, "step": 6120}, {"loss": 1.3814, "grad_norm": 0.7850478887557983, "learning_rate": 0.0002, "epoch": 4.579753455360478, "step": 6130}, {"loss": 1.3579, "grad_norm": 0.8112621307373047, "learning_rate": 0.0002, "epoch": 4.5872245050429585, "step": 6140}, {"loss": 1.3523, "grad_norm": 0.7040849328041077, "learning_rate": 0.0002, "epoch": 4.594695554725439, "step": 6150}, {"loss": 1.3526, "grad_norm": 0.7214553952217102, "learning_rate": 0.0002, "epoch": 4.60216660440792, "step": 6160}, {"loss": 1.3932, "grad_norm": 0.8616511821746826, "learning_rate": 0.0002, "epoch": 4.6096376540904, "step": 6170}, {"loss": 1.4622, "grad_norm": 0.8374658226966858, "learning_rate": 0.0002, "epoch": 4.61710870377288, "step": 6180}, {"loss": 1.3703, "grad_norm": 0.6761606931686401, "learning_rate": 0.0002, "epoch": 4.62457975345536, "step": 6190}, {"loss": 1.3977, "grad_norm": 0.768028199672699, "learning_rate": 0.0002, "epoch": 4.632050803137841, "step": 6200}, {"loss": 1.3772, "grad_norm": 0.9372717142105103, "learning_rate": 0.0002, "epoch": 4.639521852820321, "step": 6210}, {"loss": 1.4098, "grad_norm": 0.7906546592712402, "learning_rate": 0.0002, "epoch": 4.646992902502801, "step": 6220}, {"loss": 1.3962, "grad_norm": 0.7376723289489746, "learning_rate": 0.0002, "epoch": 4.654463952185282, "step": 6230}, {"loss": 1.4529, "grad_norm": 0.8972630500793457, "learning_rate": 0.0002, "epoch": 4.6619350018677626, "step": 6240}, {"loss": 1.4668, "grad_norm": 0.8261756300926208, "learning_rate": 0.0002, "epoch": 4.669406051550243, "step": 6250}, {"loss": 1.3267, "grad_norm": 0.7512393593788147, "learning_rate": 0.0002, "epoch": 4.676877101232723, "step": 6260}, {"loss": 1.4278, "grad_norm": 0.7132362127304077, "learning_rate": 0.0002, "epoch": 4.684348150915204, "step": 6270}, {"loss": 1.4299, "grad_norm": 0.7690575122833252, "learning_rate": 0.0002, "epoch": 4.691819200597684, "step": 6280}, {"loss": 1.4769, "grad_norm": 0.9886258840560913, "learning_rate": 0.0002, "epoch": 4.699290250280164, "step": 6290}, {"loss": 1.4005, "grad_norm": 0.9502435922622681, "learning_rate": 0.0002, "epoch": 4.706761299962645, "step": 6300}, {"loss": 1.4319, "grad_norm": 0.702255129814148, "learning_rate": 0.0002, "epoch": 4.714232349645125, "step": 6310}, {"loss": 1.4447, "grad_norm": 0.7713103890419006, "learning_rate": 0.0002, "epoch": 4.721703399327605, "step": 6320}, {"loss": 1.4392, "grad_norm": 0.7778580784797668, "learning_rate": 0.0002, "epoch": 4.7291744490100855, "step": 6330}, {"loss": 1.4169, "grad_norm": 0.7275111079216003, "learning_rate": 0.0002, "epoch": 4.736645498692567, "step": 6340}, {"loss": 1.4429, "grad_norm": 0.7728744149208069, "learning_rate": 0.0002, "epoch": 4.744116548375047, "step": 6350}, {"loss": 1.3756, "grad_norm": 0.9724260568618774, "learning_rate": 0.0002, "epoch": 4.751587598057527, "step": 6360}, {"loss": 1.3358, "grad_norm": 0.7505622506141663, "learning_rate": 0.0002, "epoch": 4.759058647740007, "step": 6370}, {"loss": 1.379, "grad_norm": 0.7994682788848877, "learning_rate": 0.0002, "epoch": 4.766529697422488, "step": 6380}, {"loss": 1.4275, "grad_norm": 0.8432038426399231, "learning_rate": 0.0002, "epoch": 4.774000747104968, "step": 6390}, {"loss": 1.4606, "grad_norm": 0.7436022758483887, "learning_rate": 0.0002, "epoch": 4.781471796787448, "step": 6400}, {"loss": 1.3461, "grad_norm": 0.7709194421768188, "learning_rate": 0.0002, "epoch": 4.788942846469929, "step": 6410}, {"loss": 1.3715, "grad_norm": 0.8798436522483826, "learning_rate": 0.0002, "epoch": 4.796413896152409, "step": 6420}, {"loss": 1.3761, "grad_norm": 0.790189266204834, "learning_rate": 0.0002, "epoch": 4.80388494583489, "step": 6430}, {"loss": 1.4109, "grad_norm": 0.6824303865432739, "learning_rate": 0.0002, "epoch": 4.811355995517371, "step": 6440}, {"loss": 1.3877, "grad_norm": 0.7501044869422913, "learning_rate": 0.0002, "epoch": 4.818827045199851, "step": 6450}, {"loss": 1.4458, "grad_norm": 0.8840398192405701, "learning_rate": 0.0002, "epoch": 4.826298094882331, "step": 6460}, {"loss": 1.4412, "grad_norm": 0.7812688946723938, "learning_rate": 0.0002, "epoch": 4.833769144564811, "step": 6470}, {"loss": 1.4299, "grad_norm": 0.7429926991462708, "learning_rate": 0.0002, "epoch": 4.841240194247292, "step": 6480}, {"loss": 1.5062, "grad_norm": 0.7778021693229675, "learning_rate": 0.0002, "epoch": 4.848711243929772, "step": 6490}, {"loss": 1.4589, "grad_norm": 0.8270702362060547, "learning_rate": 0.0002, "epoch": 4.856182293612252, "step": 6500}, {"loss": 1.4091, "grad_norm": 0.6960513591766357, "learning_rate": 0.0002, "epoch": 4.863653343294732, "step": 6510}, {"loss": 1.376, "grad_norm": 0.7728942632675171, "learning_rate": 0.0002, "epoch": 4.8711243929772134, "step": 6520}, {"loss": 1.4852, "grad_norm": 0.7377303838729858, "learning_rate": 0.0002, "epoch": 4.878595442659694, "step": 6530}, {"loss": 1.3846, "grad_norm": 0.7257253527641296, "learning_rate": 0.0002, "epoch": 4.886066492342174, "step": 6540}, {"loss": 1.4166, "grad_norm": 0.7875821590423584, "learning_rate": 0.0002, "epoch": 4.893537542024655, "step": 6550}, {"loss": 1.357, "grad_norm": 0.8346304297447205, "learning_rate": 0.0002, "epoch": 4.901008591707135, "step": 6560}, {"loss": 1.4522, "grad_norm": 0.7710739374160767, "learning_rate": 0.0002, "epoch": 4.908479641389615, "step": 6570}, {"loss": 1.4465, "grad_norm": 0.7015138268470764, "learning_rate": 0.0002, "epoch": 4.915950691072096, "step": 6580}, {"loss": 1.435, "grad_norm": 0.8707432150840759, "learning_rate": 0.0002, "epoch": 4.923421740754576, "step": 6590}, {"loss": 1.2968, "grad_norm": 0.786601185798645, "learning_rate": 0.0002, "epoch": 4.930892790437056, "step": 6600}, {"loss": 1.4385, "grad_norm": 0.978519082069397, "learning_rate": 0.0002, "epoch": 4.938363840119536, "step": 6610}, {"loss": 1.3997, "grad_norm": 0.8102927207946777, "learning_rate": 0.0002, "epoch": 4.9458348898020175, "step": 6620}, {"loss": 1.4859, "grad_norm": 0.7628704309463501, "learning_rate": 0.0002, "epoch": 4.953305939484498, "step": 6630}, {"loss": 1.3774, "grad_norm": 0.8053455352783203, "learning_rate": 0.0002, "epoch": 4.960776989166978, "step": 6640}, {"loss": 1.5092, "grad_norm": 0.8680412173271179, "learning_rate": 0.0002, "epoch": 4.968248038849458, "step": 6650}, {"loss": 1.3978, "grad_norm": 0.7415758371353149, "learning_rate": 0.0002, "epoch": 4.975719088531939, "step": 6660}, {"loss": 1.3793, "grad_norm": 0.7730312347412109, "learning_rate": 0.0002, "epoch": 4.983190138214419, "step": 6670}, {"loss": 1.4863, "grad_norm": 0.7924041152000427, "learning_rate": 0.0002, "epoch": 4.990661187896899, "step": 6680}, {"loss": 1.4137, "grad_norm": 0.8677893877029419, "learning_rate": 0.0002, "epoch": 4.99813223757938, "step": 6690}, {"eval_loss": 1.9444633722305298, "eval_runtime": 39.3488, "eval_samples_per_second": 13.088, "eval_steps_per_second": 1.652, "epoch": 4.999626447515876, "step": 6692}, {"loss": 1.3076, "grad_norm": 0.7102245092391968, "learning_rate": 0.0002, "epoch": 5.00560328726186, "step": 6700}, {"loss": 1.2714, "grad_norm": 1.0425463914871216, "learning_rate": 0.0002, "epoch": 5.0130743369443405, "step": 6710}, {"loss": 1.181, "grad_norm": 0.9320756793022156, "learning_rate": 0.0002, "epoch": 5.0205453866268215, "step": 6720}, {"loss": 1.1786, "grad_norm": 0.8797217607498169, "learning_rate": 0.0002, "epoch": 5.028016436309302, "step": 6730}, {"loss": 1.2097, "grad_norm": 2.135707139968872, "learning_rate": 0.0002, "epoch": 5.035487485991782, "step": 6740}, {"loss": 1.1761, "grad_norm": 0.8747734427452087, "learning_rate": 0.0002, "epoch": 5.042958535674262, "step": 6750}, {"loss": 1.1675, "grad_norm": 0.9981076717376709, "learning_rate": 0.0002, "epoch": 5.050429585356743, "step": 6760}, {"loss": 1.1976, "grad_norm": 0.985078752040863, "learning_rate": 0.0002, "epoch": 5.057900635039223, "step": 6770}, {"loss": 1.2688, "grad_norm": 1.0974019765853882, "learning_rate": 0.0002, "epoch": 5.065371684721703, "step": 6780}, {"loss": 1.1982, "grad_norm": 0.9823219180107117, "learning_rate": 0.0002, "epoch": 5.072842734404184, "step": 6790}, {"loss": 1.2586, "grad_norm": 1.122605562210083, "learning_rate": 0.0002, "epoch": 5.080313784086664, "step": 6800}, {"loss": 1.2069, "grad_norm": 0.8556802272796631, "learning_rate": 0.0002, "epoch": 5.0877848337691445, "step": 6810}, {"loss": 1.1908, "grad_norm": 1.1699262857437134, "learning_rate": 0.0002, "epoch": 5.095255883451625, "step": 6820}, {"loss": 1.1869, "grad_norm": 1.0440590381622314, "learning_rate": 0.0002, "epoch": 5.102726933134106, "step": 6830}, {"loss": 1.1655, "grad_norm": 1.0445300340652466, "learning_rate": 0.0002, "epoch": 5.110197982816586, "step": 6840}, {"loss": 1.2392, "grad_norm": 0.8289563059806824, "learning_rate": 0.0002, "epoch": 5.117669032499066, "step": 6850}, {"loss": 1.1687, "grad_norm": 1.1051193475723267, "learning_rate": 0.0002, "epoch": 5.125140082181547, "step": 6860}, {"loss": 1.2737, "grad_norm": 0.9345614910125732, "learning_rate": 0.0002, "epoch": 5.132611131864027, "step": 6870}, {"loss": 1.3021, "grad_norm": 1.1222996711730957, "learning_rate": 0.0002, "epoch": 5.140082181546507, "step": 6880}, {"loss": 1.2408, "grad_norm": 0.9405338764190674, "learning_rate": 0.0002, "epoch": 5.147553231228987, "step": 6890}, {"loss": 1.2367, "grad_norm": 1.0935171842575073, "learning_rate": 0.0002, "epoch": 5.155024280911468, "step": 6900}, {"loss": 1.2458, "grad_norm": 1.0438612699508667, "learning_rate": 0.0002, "epoch": 5.1624953305939485, "step": 6910}, {"loss": 1.2562, "grad_norm": 1.1189004182815552, "learning_rate": 0.0002, "epoch": 5.169966380276429, "step": 6920}, {"loss": 1.25, "grad_norm": 1.0533215999603271, "learning_rate": 0.0002, "epoch": 5.17743742995891, "step": 6930}, {"loss": 1.2974, "grad_norm": 0.9779648780822754, "learning_rate": 0.0002, "epoch": 5.18490847964139, "step": 6940}, {"loss": 1.1965, "grad_norm": 0.8920868635177612, "learning_rate": 0.0002, "epoch": 5.19237952932387, "step": 6950}, {"loss": 1.283, "grad_norm": 0.8374548554420471, "learning_rate": 0.0002, "epoch": 5.19985057900635, "step": 6960}, {"loss": 1.2775, "grad_norm": 1.0490682125091553, "learning_rate": 0.0002, "epoch": 5.207321628688831, "step": 6970}, {"loss": 1.1826, "grad_norm": 0.9658287167549133, "learning_rate": 0.0002, "epoch": 5.214792678371311, "step": 6980}, {"loss": 1.2647, "grad_norm": 0.9652056097984314, "learning_rate": 0.0002, "epoch": 5.222263728053791, "step": 6990}, {"loss": 1.3023, "grad_norm": 0.9141794443130493, "learning_rate": 0.0002, "epoch": 5.229734777736272, "step": 7000}, {"loss": 1.2456, "grad_norm": 0.9831376671791077, "learning_rate": 0.0002, "epoch": 5.2372058274187525, "step": 7010}, {"loss": 1.2176, "grad_norm": 1.0198718309402466, "learning_rate": 0.0002, "epoch": 5.244676877101233, "step": 7020}, {"loss": 1.2643, "grad_norm": 0.9647888541221619, "learning_rate": 0.0002, "epoch": 5.252147926783713, "step": 7030}, {"loss": 1.2106, "grad_norm": 1.3941649198532104, "learning_rate": 0.0002, "epoch": 5.259618976466194, "step": 7040}, {"loss": 1.2885, "grad_norm": 1.0305466651916504, "learning_rate": 0.0002, "epoch": 5.267090026148674, "step": 7050}, {"loss": 1.2362, "grad_norm": 0.9577859044075012, "learning_rate": 0.0002, "epoch": 5.274561075831154, "step": 7060}, {"loss": 1.2231, "grad_norm": 1.149092197418213, "learning_rate": 0.0002, "epoch": 5.282032125513634, "step": 7070}, {"loss": 1.2986, "grad_norm": 1.2582733631134033, "learning_rate": 0.0002, "epoch": 5.289503175196115, "step": 7080}, {"loss": 1.2307, "grad_norm": 1.1777442693710327, "learning_rate": 0.0002, "epoch": 5.296974224878595, "step": 7090}, {"loss": 1.24, "grad_norm": 1.0076404809951782, "learning_rate": 0.0002, "epoch": 5.3044452745610755, "step": 7100}, {"loss": 1.1407, "grad_norm": 0.9037365913391113, "learning_rate": 0.0002, "epoch": 5.3119163242435565, "step": 7110}, {"loss": 1.238, "grad_norm": 0.9428724646568298, "learning_rate": 0.0002, "epoch": 5.319387373926037, "step": 7120}, {"loss": 1.2571, "grad_norm": 0.9935154318809509, "learning_rate": 0.0002, "epoch": 5.326858423608517, "step": 7130}, {"loss": 1.2833, "grad_norm": 1.087500810623169, "learning_rate": 0.0002, "epoch": 5.334329473290998, "step": 7140}, {"loss": 1.2304, "grad_norm": 0.8543072938919067, "learning_rate": 0.0002, "epoch": 5.341800522973478, "step": 7150}, {"loss": 1.2755, "grad_norm": 0.9323700070381165, "learning_rate": 0.0002, "epoch": 5.349271572655958, "step": 7160}, {"loss": 1.2769, "grad_norm": 1.0037827491760254, "learning_rate": 0.0002, "epoch": 5.356742622338438, "step": 7170}, {"loss": 1.3204, "grad_norm": 0.8746469616889954, "learning_rate": 0.0002, "epoch": 5.364213672020919, "step": 7180}, {"loss": 1.2759, "grad_norm": 0.9516328573226929, "learning_rate": 0.0002, "epoch": 5.371684721703399, "step": 7190}, {"loss": 1.2428, "grad_norm": 0.9395177364349365, "learning_rate": 0.0002, "epoch": 5.3791557713858795, "step": 7200}, {"loss": 1.3214, "grad_norm": 1.000369906425476, "learning_rate": 0.0002, "epoch": 5.38662682106836, "step": 7210}, {"loss": 1.2337, "grad_norm": 1.0845502614974976, "learning_rate": 0.0002, "epoch": 5.394097870750841, "step": 7220}, {"loss": 1.2776, "grad_norm": 0.8975145220756531, "learning_rate": 0.0002, "epoch": 5.401568920433321, "step": 7230}, {"loss": 1.2306, "grad_norm": 1.040077805519104, "learning_rate": 0.0002, "epoch": 5.409039970115801, "step": 7240}, {"loss": 1.2277, "grad_norm": 1.0729942321777344, "learning_rate": 0.0002, "epoch": 5.416511019798282, "step": 7250}, {"loss": 1.2714, "grad_norm": 0.8322232961654663, "learning_rate": 0.0002, "epoch": 5.423982069480762, "step": 7260}, {"loss": 1.3036, "grad_norm": 1.0654641389846802, "learning_rate": 0.0002, "epoch": 5.431453119163242, "step": 7270}, {"loss": 1.268, "grad_norm": 1.0445852279663086, "learning_rate": 0.0002, "epoch": 5.438924168845723, "step": 7280}, {"loss": 1.2743, "grad_norm": 1.0762956142425537, "learning_rate": 0.0002, "epoch": 5.446395218528203, "step": 7290}, {"loss": 1.2887, "grad_norm": 0.9721953868865967, "learning_rate": 0.0002, "epoch": 5.4538662682106835, "step": 7300}, {"loss": 1.2833, "grad_norm": 0.9238539338111877, "learning_rate": 0.0002, "epoch": 5.461337317893164, "step": 7310}, {"loss": 1.255, "grad_norm": 0.9912874102592468, "learning_rate": 0.0002, "epoch": 5.468808367575645, "step": 7320}, {"loss": 1.2557, "grad_norm": 1.0727077722549438, "learning_rate": 0.0002, "epoch": 5.476279417258125, "step": 7330}, {"loss": 1.3471, "grad_norm": 0.8633865118026733, "learning_rate": 0.0002, "epoch": 5.483750466940605, "step": 7340}, {"loss": 1.3155, "grad_norm": 0.9396262764930725, "learning_rate": 0.0002, "epoch": 5.491221516623085, "step": 7350}, {"loss": 1.3146, "grad_norm": 1.0253715515136719, "learning_rate": 0.0002, "epoch": 5.498692566305566, "step": 7360}, {"loss": 1.3156, "grad_norm": 1.006047010421753, "learning_rate": 0.0002, "epoch": 5.506163615988046, "step": 7370}, {"loss": 1.3107, "grad_norm": 0.9781233072280884, "learning_rate": 0.0002, "epoch": 5.513634665670526, "step": 7380}, {"loss": 1.2703, "grad_norm": 0.9945126175880432, "learning_rate": 0.0002, "epoch": 5.521105715353007, "step": 7390}, {"loss": 1.1936, "grad_norm": 0.9081175327301025, "learning_rate": 0.0002, "epoch": 5.528576765035488, "step": 7400}, {"loss": 1.2651, "grad_norm": 1.2215938568115234, "learning_rate": 0.0002, "epoch": 5.536047814717968, "step": 7410}, {"loss": 1.2484, "grad_norm": 1.0724077224731445, "learning_rate": 0.0002, "epoch": 5.543518864400449, "step": 7420}, {"loss": 1.3083, "grad_norm": 1.106955885887146, "learning_rate": 0.0002, "epoch": 5.550989914082929, "step": 7430}, {"loss": 1.2125, "grad_norm": 1.0657650232315063, "learning_rate": 0.0002, "epoch": 5.558460963765409, "step": 7440}, {"loss": 1.2576, "grad_norm": 0.9725455641746521, "learning_rate": 0.0002, "epoch": 5.565932013447889, "step": 7450}, {"loss": 1.3297, "grad_norm": 0.8604224324226379, "learning_rate": 0.0002, "epoch": 5.57340306313037, "step": 7460}, {"loss": 1.3084, "grad_norm": 0.9913371205329895, "learning_rate": 0.0002, "epoch": 5.58087411281285, "step": 7470}, {"loss": 1.3371, "grad_norm": 1.012073040008545, "learning_rate": 0.0002, "epoch": 5.58834516249533, "step": 7480}, {"loss": 1.2526, "grad_norm": 1.1003159284591675, "learning_rate": 0.0002, "epoch": 5.5958162121778106, "step": 7490}, {"loss": 1.2577, "grad_norm": 0.9104593992233276, "learning_rate": 0.0002, "epoch": 5.603287261860292, "step": 7500}, {"loss": 1.2578, "grad_norm": 0.9480831623077393, "learning_rate": 0.0002, "epoch": 5.610758311542772, "step": 7510}, {"loss": 1.3056, "grad_norm": 1.0826456546783447, "learning_rate": 0.0002, "epoch": 5.618229361225252, "step": 7520}, {"loss": 1.2931, "grad_norm": 0.8286259174346924, "learning_rate": 0.0002, "epoch": 5.625700410907733, "step": 7530}, {"loss": 1.2918, "grad_norm": 0.9145061373710632, "learning_rate": 0.0002, "epoch": 5.633171460590213, "step": 7540}, {"loss": 1.1736, "grad_norm": 0.9363601803779602, "learning_rate": 0.0002, "epoch": 5.640642510272693, "step": 7550}, {"loss": 1.2265, "grad_norm": 0.9553244709968567, "learning_rate": 0.0002, "epoch": 5.648113559955174, "step": 7560}, {"loss": 1.2356, "grad_norm": 1.0343557596206665, "learning_rate": 0.0002, "epoch": 5.655584609637654, "step": 7570}, {"loss": 1.3171, "grad_norm": 0.8734238743782043, "learning_rate": 0.0002, "epoch": 5.663055659320134, "step": 7580}, {"loss": 1.2785, "grad_norm": 1.0230586528778076, "learning_rate": 0.0002, "epoch": 5.670526709002615, "step": 7590}, {"loss": 1.2936, "grad_norm": 1.0063409805297852, "learning_rate": 0.0002, "epoch": 5.677997758685096, "step": 7600}, {"loss": 1.2396, "grad_norm": 1.0104626417160034, "learning_rate": 0.0002, "epoch": 5.685468808367576, "step": 7610}, {"loss": 1.2581, "grad_norm": 0.9528168439865112, "learning_rate": 0.0002, "epoch": 5.692939858050056, "step": 7620}, {"loss": 1.3116, "grad_norm": 0.9799878597259521, "learning_rate": 0.0002, "epoch": 5.700410907732536, "step": 7630}, {"loss": 1.2632, "grad_norm": 0.969351589679718, "learning_rate": 0.0002, "epoch": 5.707881957415017, "step": 7640}, {"loss": 1.3055, "grad_norm": 1.3037652969360352, "learning_rate": 0.0002, "epoch": 5.715353007097497, "step": 7650}, {"loss": 1.3126, "grad_norm": 1.0640486478805542, "learning_rate": 0.0002, "epoch": 5.722824056779977, "step": 7660}, {"loss": 1.3325, "grad_norm": 1.0416420698165894, "learning_rate": 0.0002, "epoch": 5.730295106462458, "step": 7670}, {"loss": 1.25, "grad_norm": 0.8893619775772095, "learning_rate": 0.0002, "epoch": 5.7377661561449385, "step": 7680}, {"loss": 1.319, "grad_norm": 0.8512844443321228, "learning_rate": 0.0002, "epoch": 5.745237205827419, "step": 7690}, {"loss": 1.3328, "grad_norm": 0.9955748319625854, "learning_rate": 0.0002, "epoch": 5.7527082555099, "step": 7700}, {"loss": 1.294, "grad_norm": 1.0409910678863525, "learning_rate": 0.0002, "epoch": 5.76017930519238, "step": 7710}, {"loss": 1.3518, "grad_norm": 1.010097861289978, "learning_rate": 0.0002, "epoch": 5.76765035487486, "step": 7720}, {"loss": 1.2106, "grad_norm": 0.8974892497062683, "learning_rate": 0.0002, "epoch": 5.77512140455734, "step": 7730}, {"loss": 1.2743, "grad_norm": 0.972835123538971, "learning_rate": 0.0002, "epoch": 5.782592454239821, "step": 7740}, {"loss": 1.3549, "grad_norm": 0.9607440829277039, "learning_rate": 0.0002, "epoch": 5.790063503922301, "step": 7750}, {"loss": 1.29, "grad_norm": 0.9426500797271729, "learning_rate": 0.0002, "epoch": 5.797534553604781, "step": 7760}, {"loss": 1.274, "grad_norm": 0.8745320439338684, "learning_rate": 0.0002, "epoch": 5.8050056032872615, "step": 7770}, {"loss": 1.3009, "grad_norm": 1.0117204189300537, "learning_rate": 0.0002, "epoch": 5.8124766529697425, "step": 7780}, {"loss": 1.3135, "grad_norm": 1.0387755632400513, "learning_rate": 0.0002, "epoch": 5.819947702652223, "step": 7790}, {"loss": 1.2709, "grad_norm": 1.0709784030914307, "learning_rate": 0.0002, "epoch": 5.827418752334703, "step": 7800}, {"loss": 1.225, "grad_norm": 0.9512667655944824, "learning_rate": 0.0002, "epoch": 5.834889802017184, "step": 7810}, {"loss": 1.3284, "grad_norm": 1.021094560623169, "learning_rate": 0.0002, "epoch": 5.842360851699664, "step": 7820}, {"loss": 1.2794, "grad_norm": 1.117491364479065, "learning_rate": 0.0002, "epoch": 5.849831901382144, "step": 7830}, {"loss": 1.3646, "grad_norm": 0.9252554178237915, "learning_rate": 0.0002, "epoch": 5.857302951064625, "step": 7840}, {"loss": 1.2976, "grad_norm": 1.1416207551956177, "learning_rate": 0.0002, "epoch": 5.864774000747105, "step": 7850}, {"loss": 1.3293, "grad_norm": 1.1219907999038696, "learning_rate": 0.0002, "epoch": 5.872245050429585, "step": 7860}, {"loss": 1.2334, "grad_norm": 0.8300467729568481, "learning_rate": 0.0002, "epoch": 5.8797161001120655, "step": 7870}, {"loss": 1.3132, "grad_norm": 1.00551438331604, "learning_rate": 0.0002, "epoch": 5.8871871497945465, "step": 7880}, {"loss": 1.2609, "grad_norm": 0.8981153964996338, "learning_rate": 0.0002, "epoch": 5.894658199477027, "step": 7890}, {"loss": 1.2817, "grad_norm": 1.0247976779937744, "learning_rate": 0.0002, "epoch": 5.902129249159507, "step": 7900}, {"loss": 1.2866, "grad_norm": 1.0820319652557373, "learning_rate": 0.0002, "epoch": 5.909600298841987, "step": 7910}, {"loss": 1.2941, "grad_norm": 0.952675461769104, "learning_rate": 0.0002, "epoch": 5.917071348524468, "step": 7920}, {"loss": 1.307, "grad_norm": 0.8666740655899048, "learning_rate": 0.0002, "epoch": 5.924542398206948, "step": 7930}, {"loss": 1.2752, "grad_norm": 0.8640421032905579, "learning_rate": 0.0002, "epoch": 5.932013447889428, "step": 7940}, {"loss": 1.2386, "grad_norm": 1.2343276739120483, "learning_rate": 0.0002, "epoch": 5.939484497571909, "step": 7950}, {"loss": 1.2333, "grad_norm": 0.958046555519104, "learning_rate": 0.0002, "epoch": 5.946955547254389, "step": 7960}, {"loss": 1.2352, "grad_norm": 1.0538510084152222, "learning_rate": 0.0002, "epoch": 5.9544265969368695, "step": 7970}, {"loss": 1.3233, "grad_norm": 1.2681571245193481, "learning_rate": 0.0002, "epoch": 5.9618976466193505, "step": 7980}, {"loss": 1.2514, "grad_norm": 0.8171183466911316, "learning_rate": 0.0002, "epoch": 5.969368696301831, "step": 7990}, {"loss": 1.3412, "grad_norm": 0.9109523892402649, "learning_rate": 0.0002, "epoch": 5.976839745984311, "step": 8000}, {"loss": 1.3497, "grad_norm": 1.0040639638900757, "learning_rate": 0.0002, "epoch": 5.984310795666791, "step": 8010}, {"loss": 1.3299, "grad_norm": 0.9596554040908813, "learning_rate": 0.0002, "epoch": 5.991781845349272, "step": 8020}, {"loss": 1.3109, "grad_norm": 0.9782963991165161, "learning_rate": 0.0002, "epoch": 5.999252895031752, "step": 8030}]} +{"epoch": 6.999626447515876, "step": 9369, "epoch_duration": 1874.6490201950073, "total_accumulated_duration": 11295.583930015564, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}, {"eval_loss": 1.8046749830245972, "eval_runtime": 38.5096, "eval_samples_per_second": 13.373, "eval_steps_per_second": 1.688, "epoch": 2.0, "step": 2677}, {"loss": 1.7265, "grad_norm": 0.36817631125450134, "learning_rate": 0.0002, "epoch": 2.002241314904744, "step": 2680}, {"loss": 1.548, "grad_norm": 0.4056456685066223, "learning_rate": 0.0002, "epoch": 2.0097123645872244, "step": 2690}, {"loss": 1.5515, "grad_norm": 0.37416863441467285, "learning_rate": 0.0002, "epoch": 2.017183414269705, "step": 2700}, {"loss": 1.5895, "grad_norm": 0.4273638427257538, "learning_rate": 0.0002, "epoch": 2.024654463952185, "step": 2710}, {"loss": 1.5884, "grad_norm": 0.36497923731803894, "learning_rate": 0.0002, "epoch": 2.0321255136346656, "step": 2720}, {"loss": 1.6999, "grad_norm": 0.5021994113922119, "learning_rate": 0.0002, "epoch": 2.0395965633171462, "step": 2730}, {"loss": 1.6655, "grad_norm": 0.45896220207214355, "learning_rate": 0.0002, "epoch": 2.0470676129996264, "step": 2740}, {"loss": 1.6305, "grad_norm": 0.3973815143108368, "learning_rate": 0.0002, "epoch": 2.054538662682107, "step": 2750}, {"loss": 1.6301, "grad_norm": 0.4521815776824951, "learning_rate": 0.0002, "epoch": 2.062009712364587, "step": 2760}, {"loss": 1.6189, "grad_norm": 0.42775002121925354, "learning_rate": 0.0002, "epoch": 2.0694807620470677, "step": 2770}, {"loss": 1.6491, "grad_norm": 0.48158586025238037, "learning_rate": 0.0002, "epoch": 2.076951811729548, "step": 2780}, {"loss": 1.6301, "grad_norm": 0.4612371623516083, "learning_rate": 0.0002, "epoch": 2.0844228614120284, "step": 2790}, {"loss": 1.6327, "grad_norm": 0.42536866664886475, "learning_rate": 0.0002, "epoch": 2.091893911094509, "step": 2800}, {"loss": 1.651, "grad_norm": 0.48515772819519043, "learning_rate": 0.0002, "epoch": 2.099364960776989, "step": 2810}, {"loss": 1.6829, "grad_norm": 0.41418662667274475, "learning_rate": 0.0002, "epoch": 2.1068360104594697, "step": 2820}, {"loss": 1.6266, "grad_norm": 0.4683697819709778, "learning_rate": 0.0002, "epoch": 2.11430706014195, "step": 2830}, {"loss": 1.6586, "grad_norm": 0.4484657049179077, "learning_rate": 0.0002, "epoch": 2.1217781098244304, "step": 2840}, {"loss": 1.6483, "grad_norm": 0.6621400713920593, "learning_rate": 0.0002, "epoch": 2.1292491595069105, "step": 2850}, {"loss": 1.5755, "grad_norm": 0.45074811577796936, "learning_rate": 0.0002, "epoch": 2.136720209189391, "step": 2860}, {"loss": 1.6456, "grad_norm": 0.3513113558292389, "learning_rate": 0.0002, "epoch": 2.1441912588718717, "step": 2870}, {"loss": 1.6081, "grad_norm": 0.40411314368247986, "learning_rate": 0.0002, "epoch": 2.151662308554352, "step": 2880}, {"loss": 1.6323, "grad_norm": 0.4121065139770508, "learning_rate": 0.0002, "epoch": 2.1591333582368324, "step": 2890}, {"loss": 1.6324, "grad_norm": 0.44723689556121826, "learning_rate": 0.0002, "epoch": 2.1666044079193125, "step": 2900}, {"loss": 1.5699, "grad_norm": 0.4226122498512268, "learning_rate": 0.0002, "epoch": 2.174075457601793, "step": 2910}, {"loss": 1.5652, "grad_norm": 0.46617650985717773, "learning_rate": 0.0002, "epoch": 2.1815465072842732, "step": 2920}, {"loss": 1.6378, "grad_norm": 0.4506422281265259, "learning_rate": 0.0002, "epoch": 2.189017556966754, "step": 2930}, {"loss": 1.6112, "grad_norm": 0.4892672896385193, "learning_rate": 0.0002, "epoch": 2.1964886066492344, "step": 2940}, {"loss": 1.6176, "grad_norm": 0.44095516204833984, "learning_rate": 0.0002, "epoch": 2.2039596563317145, "step": 2950}, {"loss": 1.6058, "grad_norm": 0.41522109508514404, "learning_rate": 0.0002, "epoch": 2.211430706014195, "step": 2960}, {"loss": 1.5964, "grad_norm": 0.4860858917236328, "learning_rate": 0.0002, "epoch": 2.2189017556966752, "step": 2970}, {"loss": 1.6427, "grad_norm": 0.42662516236305237, "learning_rate": 0.0002, "epoch": 2.226372805379156, "step": 2980}, {"loss": 1.6313, "grad_norm": 0.4390648305416107, "learning_rate": 0.0002, "epoch": 2.233843855061636, "step": 2990}, {"loss": 1.5992, "grad_norm": 0.47515565156936646, "learning_rate": 0.0002, "epoch": 2.2413149047441165, "step": 3000}, {"loss": 1.5563, "grad_norm": 0.4104543924331665, "learning_rate": 0.0002, "epoch": 2.248785954426597, "step": 3010}, {"loss": 1.6895, "grad_norm": 0.4404028654098511, "learning_rate": 0.0002, "epoch": 2.2562570041090773, "step": 3020}, {"loss": 1.6088, "grad_norm": 0.4717366695404053, "learning_rate": 0.0002, "epoch": 2.263728053791558, "step": 3030}, {"loss": 1.7287, "grad_norm": 0.48345857858657837, "learning_rate": 0.0002, "epoch": 2.271199103474038, "step": 3040}, {"loss": 1.681, "grad_norm": 0.5312452912330627, "learning_rate": 0.0002, "epoch": 2.2786701531565186, "step": 3050}, {"loss": 1.5901, "grad_norm": 0.5073099732398987, "learning_rate": 0.0002, "epoch": 2.2861412028389987, "step": 3060}, {"loss": 1.6914, "grad_norm": 0.5027463436126709, "learning_rate": 0.0002, "epoch": 2.2936122525214793, "step": 3070}, {"loss": 1.5862, "grad_norm": 0.5436304807662964, "learning_rate": 0.0002, "epoch": 2.30108330220396, "step": 3080}, {"loss": 1.5763, "grad_norm": 0.4701065123081207, "learning_rate": 0.0002, "epoch": 2.30855435188644, "step": 3090}, {"loss": 1.6177, "grad_norm": 0.46988746523857117, "learning_rate": 0.0002, "epoch": 2.3160254015689206, "step": 3100}, {"loss": 1.6502, "grad_norm": 0.45112869143486023, "learning_rate": 0.0002, "epoch": 2.3234964512514007, "step": 3110}, {"loss": 1.6291, "grad_norm": 0.5173566937446594, "learning_rate": 0.0002, "epoch": 2.3309675009338813, "step": 3120}, {"loss": 1.6743, "grad_norm": 0.40345850586891174, "learning_rate": 0.0002, "epoch": 2.3384385506163614, "step": 3130}, {"loss": 1.621, "grad_norm": 0.4218924939632416, "learning_rate": 0.0002, "epoch": 2.345909600298842, "step": 3140}, {"loss": 1.6341, "grad_norm": 0.41857317090034485, "learning_rate": 0.0002, "epoch": 2.3533806499813226, "step": 3150}, {"loss": 1.6087, "grad_norm": 0.4197218418121338, "learning_rate": 0.0002, "epoch": 2.3608516996638027, "step": 3160}, {"loss": 1.6572, "grad_norm": 0.4260677397251129, "learning_rate": 0.0002, "epoch": 2.3683227493462833, "step": 3170}, {"loss": 1.6376, "grad_norm": 0.4209042191505432, "learning_rate": 0.0002, "epoch": 2.3757937990287634, "step": 3180}, {"loss": 1.634, "grad_norm": 0.4092234969139099, "learning_rate": 0.0002, "epoch": 2.383264848711244, "step": 3190}, {"loss": 1.6339, "grad_norm": 0.4928431510925293, "learning_rate": 0.0002, "epoch": 2.390735898393724, "step": 3200}, {"loss": 1.6015, "grad_norm": 0.49252402782440186, "learning_rate": 0.0002, "epoch": 2.3982069480762047, "step": 3210}, {"loss": 1.5773, "grad_norm": 0.4368397295475006, "learning_rate": 0.0002, "epoch": 2.4056779977586853, "step": 3220}, {"loss": 1.6629, "grad_norm": 0.46122390031814575, "learning_rate": 0.0002, "epoch": 2.4131490474411654, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4272301197052002, "learning_rate": 0.0002, "epoch": 2.420620097123646, "step": 3240}, {"loss": 1.5961, "grad_norm": 0.41480937600135803, "learning_rate": 0.0002, "epoch": 2.428091146806126, "step": 3250}, {"loss": 1.6281, "grad_norm": 0.48911941051483154, "learning_rate": 0.0002, "epoch": 2.4355621964886067, "step": 3260}, {"loss": 1.6846, "grad_norm": 0.4444098472595215, "learning_rate": 0.0002, "epoch": 2.443033246171087, "step": 3270}, {"loss": 1.6961, "grad_norm": 0.5111684799194336, "learning_rate": 0.0002, "epoch": 2.4505042958535674, "step": 3280}, {"loss": 1.6152, "grad_norm": 0.5058825016021729, "learning_rate": 0.0002, "epoch": 2.457975345536048, "step": 3290}, {"loss": 1.625, "grad_norm": 0.44173210859298706, "learning_rate": 0.0002, "epoch": 2.465446395218528, "step": 3300}, {"loss": 1.6491, "grad_norm": 0.4659745991230011, "learning_rate": 0.0002, "epoch": 2.4729174449010087, "step": 3310}, {"loss": 1.6114, "grad_norm": 0.47237497568130493, "learning_rate": 0.0002, "epoch": 2.480388494583489, "step": 3320}, {"loss": 1.6193, "grad_norm": 0.47303131222724915, "learning_rate": 0.0002, "epoch": 2.4878595442659694, "step": 3330}, {"loss": 1.7256, "grad_norm": 0.4522389769554138, "learning_rate": 0.0002, "epoch": 2.4953305939484496, "step": 3340}, {"loss": 1.6834, "grad_norm": 0.4467332363128662, "learning_rate": 0.0002, "epoch": 2.50280164363093, "step": 3350}, {"loss": 1.6108, "grad_norm": 0.4413762092590332, "learning_rate": 0.0002, "epoch": 2.5102726933134107, "step": 3360}, {"loss": 1.537, "grad_norm": 0.495514452457428, "learning_rate": 0.0002, "epoch": 2.517743742995891, "step": 3370}, {"loss": 1.5839, "grad_norm": 0.4429773986339569, "learning_rate": 0.0002, "epoch": 2.5252147926783715, "step": 3380}, {"loss": 1.6522, "grad_norm": 0.4589079022407532, "learning_rate": 0.0002, "epoch": 2.5326858423608516, "step": 3390}, {"loss": 1.6529, "grad_norm": 0.4683997333049774, "learning_rate": 0.0002, "epoch": 2.540156892043332, "step": 3400}, {"loss": 1.6745, "grad_norm": 0.4651731252670288, "learning_rate": 0.0002, "epoch": 2.5476279417258123, "step": 3410}, {"loss": 1.5918, "grad_norm": 0.45818084478378296, "learning_rate": 0.0002, "epoch": 2.555098991408293, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.45209529995918274, "learning_rate": 0.0002, "epoch": 2.5625700410907735, "step": 3430}, {"loss": 1.5606, "grad_norm": 0.4344733655452728, "learning_rate": 0.0002, "epoch": 2.5700410907732536, "step": 3440}, {"loss": 1.6748, "grad_norm": 0.47435566782951355, "learning_rate": 0.0002, "epoch": 2.577512140455734, "step": 3450}, {"loss": 1.6237, "grad_norm": 0.43841999769210815, "learning_rate": 0.0002, "epoch": 2.5849831901382143, "step": 3460}, {"loss": 1.7207, "grad_norm": 0.4323869049549103, "learning_rate": 0.0002, "epoch": 2.592454239820695, "step": 3470}, {"loss": 1.5494, "grad_norm": 0.44355881214141846, "learning_rate": 0.0002, "epoch": 2.599925289503175, "step": 3480}, {"loss": 1.665, "grad_norm": 0.45847779512405396, "learning_rate": 0.0002, "epoch": 2.6073963391856556, "step": 3490}, {"loss": 1.6006, "grad_norm": 0.4411061704158783, "learning_rate": 0.0002, "epoch": 2.614867388868136, "step": 3500}, {"loss": 1.5868, "grad_norm": 0.4446796178817749, "learning_rate": 0.0002, "epoch": 2.6223384385506163, "step": 3510}, {"loss": 1.5946, "grad_norm": 0.41969653964042664, "learning_rate": 0.0002, "epoch": 2.629809488233097, "step": 3520}, {"loss": 1.6798, "grad_norm": 0.5263747572898865, "learning_rate": 0.0002, "epoch": 2.637280537915577, "step": 3530}, {"loss": 1.6309, "grad_norm": 0.47719451785087585, "learning_rate": 0.0002, "epoch": 2.6447515875980576, "step": 3540}, {"loss": 1.7024, "grad_norm": 0.46574118733406067, "learning_rate": 0.0002, "epoch": 2.6522226372805378, "step": 3550}, {"loss": 1.618, "grad_norm": 0.46867135167121887, "learning_rate": 0.0002, "epoch": 2.6596936869630183, "step": 3560}, {"loss": 1.5885, "grad_norm": 0.4441198706626892, "learning_rate": 0.0002, "epoch": 2.667164736645499, "step": 3570}, {"loss": 1.6426, "grad_norm": 0.4871319830417633, "learning_rate": 0.0002, "epoch": 2.674635786327979, "step": 3580}, {"loss": 1.6575, "grad_norm": 0.43900373578071594, "learning_rate": 0.0002, "epoch": 2.6821068360104596, "step": 3590}, {"loss": 1.6071, "grad_norm": 0.42509549856185913, "learning_rate": 0.0002, "epoch": 2.6895778856929398, "step": 3600}, {"loss": 1.5651, "grad_norm": 0.4691086709499359, "learning_rate": 0.0002, "epoch": 2.6970489353754203, "step": 3610}, {"loss": 1.5491, "grad_norm": 0.46318942308425903, "learning_rate": 0.0002, "epoch": 2.7045199850579005, "step": 3620}, {"loss": 1.5422, "grad_norm": 0.44631096720695496, "learning_rate": 0.0002, "epoch": 2.711991034740381, "step": 3630}, {"loss": 1.6831, "grad_norm": 0.42315489053726196, "learning_rate": 0.0002, "epoch": 2.7194620844228616, "step": 3640}, {"loss": 1.6008, "grad_norm": 0.4971241056919098, "learning_rate": 0.0002, "epoch": 2.7269331341053418, "step": 3650}, {"loss": 1.6042, "grad_norm": 0.4578486382961273, "learning_rate": 0.0002, "epoch": 2.7344041837878224, "step": 3660}, {"loss": 1.6076, "grad_norm": 0.46584776043891907, "learning_rate": 0.0002, "epoch": 2.7418752334703025, "step": 3670}, {"loss": 1.6809, "grad_norm": 0.4951731264591217, "learning_rate": 0.0002, "epoch": 2.749346283152783, "step": 3680}, {"loss": 1.6226, "grad_norm": 0.4935225546360016, "learning_rate": 0.0002, "epoch": 2.756817332835263, "step": 3690}, {"loss": 1.5878, "grad_norm": 0.41805586218833923, "learning_rate": 0.0002, "epoch": 2.764288382517744, "step": 3700}, {"loss": 1.7173, "grad_norm": 0.4417555630207062, "learning_rate": 0.0002, "epoch": 2.7717594322002244, "step": 3710}, {"loss": 1.6398, "grad_norm": 0.48229655623435974, "learning_rate": 0.0002, "epoch": 2.7792304818827045, "step": 3720}, {"loss": 1.6074, "grad_norm": 0.48562315106391907, "learning_rate": 0.0002, "epoch": 2.786701531565185, "step": 3730}, {"loss": 1.607, "grad_norm": 0.4473940432071686, "learning_rate": 0.0002, "epoch": 2.794172581247665, "step": 3740}, {"loss": 1.6065, "grad_norm": 0.4626813232898712, "learning_rate": 0.0002, "epoch": 2.801643630930146, "step": 3750}, {"loss": 1.6296, "grad_norm": 0.4339792728424072, "learning_rate": 0.0002, "epoch": 2.809114680612626, "step": 3760}, {"loss": 1.6815, "grad_norm": 0.5250858068466187, "learning_rate": 0.0002, "epoch": 2.8165857302951065, "step": 3770}, {"loss": 1.6644, "grad_norm": 0.4537523090839386, "learning_rate": 0.0002, "epoch": 2.824056779977587, "step": 3780}, {"loss": 1.6535, "grad_norm": 0.5646113157272339, "learning_rate": 0.0002, "epoch": 2.831527829660067, "step": 3790}, {"loss": 1.5712, "grad_norm": 0.44243332743644714, "learning_rate": 0.0002, "epoch": 2.8389988793425474, "step": 3800}, {"loss": 1.6478, "grad_norm": 0.4585791826248169, "learning_rate": 0.0002, "epoch": 2.846469929025028, "step": 3810}, {"loss": 1.6854, "grad_norm": 0.489702045917511, "learning_rate": 0.0002, "epoch": 2.8539409787075085, "step": 3820}, {"loss": 1.7066, "grad_norm": 0.502470850944519, "learning_rate": 0.0002, "epoch": 2.8614120283899886, "step": 3830}, {"loss": 1.5785, "grad_norm": 0.4395960867404938, "learning_rate": 0.0002, "epoch": 2.8688830780724692, "step": 3840}, {"loss": 1.6434, "grad_norm": 0.4348670244216919, "learning_rate": 0.0002, "epoch": 2.87635412775495, "step": 3850}, {"loss": 1.6163, "grad_norm": 0.48852720856666565, "learning_rate": 0.0002, "epoch": 2.88382517743743, "step": 3860}, {"loss": 1.5916, "grad_norm": 0.45317450165748596, "learning_rate": 0.0002, "epoch": 2.89129622711991, "step": 3870}, {"loss": 1.6486, "grad_norm": 0.4732758700847626, "learning_rate": 0.0002, "epoch": 2.8987672768023907, "step": 3880}, {"loss": 1.6758, "grad_norm": 0.45238012075424194, "learning_rate": 0.0002, "epoch": 2.9062383264848712, "step": 3890}, {"loss": 1.6228, "grad_norm": 0.48838064074516296, "learning_rate": 0.0002, "epoch": 2.9137093761673514, "step": 3900}, {"loss": 1.658, "grad_norm": 0.43496349453926086, "learning_rate": 0.0002, "epoch": 2.921180425849832, "step": 3910}, {"loss": 1.7063, "grad_norm": 0.47963935136795044, "learning_rate": 0.0002, "epoch": 2.9286514755323125, "step": 3920}, {"loss": 1.6553, "grad_norm": 0.4544987976551056, "learning_rate": 0.0002, "epoch": 2.9361225252147927, "step": 3930}, {"loss": 1.6192, "grad_norm": 0.4622892141342163, "learning_rate": 0.0002, "epoch": 2.943593574897273, "step": 3940}, {"loss": 1.6178, "grad_norm": 0.47026222944259644, "learning_rate": 0.0002, "epoch": 2.9510646245797534, "step": 3950}, {"loss": 1.6612, "grad_norm": 0.4549552798271179, "learning_rate": 0.0002, "epoch": 2.958535674262234, "step": 3960}, {"loss": 1.6458, "grad_norm": 0.46647515892982483, "learning_rate": 0.0002, "epoch": 2.966006723944714, "step": 3970}, {"loss": 1.6051, "grad_norm": 0.45095112919807434, "learning_rate": 0.0002, "epoch": 2.9734777736271947, "step": 3980}, {"loss": 1.6471, "grad_norm": 0.4690017104148865, "learning_rate": 0.0002, "epoch": 2.9809488233096753, "step": 3990}, {"loss": 1.6061, "grad_norm": 0.4603444039821625, "learning_rate": 0.0002, "epoch": 2.9884198729921554, "step": 4000}, {"loss": 1.6431, "grad_norm": 0.4743294417858124, "learning_rate": 0.0002, "epoch": 2.9958909226746355, "step": 4010}, {"eval_loss": 1.8252571821212769, "eval_runtime": 38.7853, "eval_samples_per_second": 13.278, "eval_steps_per_second": 1.676, "epoch": 2.999626447515876, "step": 4015}, {"loss": 1.6512, "grad_norm": 0.4919724464416504, "learning_rate": 0.0002, "epoch": 3.003361972357116, "step": 4020}, {"loss": 1.5354, "grad_norm": 0.4747185707092285, "learning_rate": 0.0002, "epoch": 3.0108330220395967, "step": 4030}, {"loss": 1.568, "grad_norm": 0.4797595143318176, "learning_rate": 0.0002, "epoch": 3.018304071722077, "step": 4040}, {"loss": 1.5194, "grad_norm": 0.5450999140739441, "learning_rate": 0.0002, "epoch": 3.0257751214045574, "step": 4050}, {"loss": 1.5065, "grad_norm": 0.49058812856674194, "learning_rate": 0.0002, "epoch": 3.0332461710870375, "step": 4060}, {"loss": 1.4884, "grad_norm": 0.5219563841819763, "learning_rate": 0.0002, "epoch": 3.040717220769518, "step": 4070}, {"loss": 1.4742, "grad_norm": 0.515628457069397, "learning_rate": 0.0002, "epoch": 3.0481882704519987, "step": 4080}, {"loss": 1.5313, "grad_norm": 0.6145984530448914, "learning_rate": 0.0002, "epoch": 3.055659320134479, "step": 4090}, {"loss": 1.4989, "grad_norm": 0.6067144274711609, "learning_rate": 0.0002, "epoch": 3.0631303698169594, "step": 4100}, {"loss": 1.528, "grad_norm": 0.5773133039474487, "learning_rate": 0.0002, "epoch": 3.0706014194994395, "step": 4110}, {"loss": 1.5374, "grad_norm": 0.6894241571426392, "learning_rate": 0.0002, "epoch": 3.07807246918192, "step": 4120}, {"loss": 1.5422, "grad_norm": 0.6422514915466309, "learning_rate": 0.0002, "epoch": 3.0855435188644003, "step": 4130}, {"loss": 1.4724, "grad_norm": 0.6119855046272278, "learning_rate": 0.0002, "epoch": 3.093014568546881, "step": 4140}, {"loss": 1.5361, "grad_norm": 0.5847280025482178, "learning_rate": 0.0002, "epoch": 3.1004856182293614, "step": 4150}, {"loss": 1.5151, "grad_norm": 0.5401515960693359, "learning_rate": 0.0002, "epoch": 3.1079566679118416, "step": 4160}, {"loss": 1.502, "grad_norm": 0.6501587629318237, "learning_rate": 0.0002, "epoch": 3.115427717594322, "step": 4170}, {"loss": 1.4952, "grad_norm": 0.5988039374351501, "learning_rate": 0.0002, "epoch": 3.1228987672768023, "step": 4180}, {"loss": 1.5287, "grad_norm": 0.4982665181159973, "learning_rate": 0.0002, "epoch": 3.130369816959283, "step": 4190}, {"loss": 1.5078, "grad_norm": 0.5548039078712463, "learning_rate": 0.0002, "epoch": 3.137840866641763, "step": 4200}, {"loss": 1.4904, "grad_norm": 0.5920777320861816, "learning_rate": 0.0002, "epoch": 3.1453119163242436, "step": 4210}, {"loss": 1.442, "grad_norm": 0.6965190172195435, "learning_rate": 0.0002, "epoch": 3.152782966006724, "step": 4220}, {"loss": 1.557, "grad_norm": 0.5196244716644287, "learning_rate": 0.0002, "epoch": 3.1602540156892043, "step": 4230}, {"loss": 1.5706, "grad_norm": 0.6942682266235352, "learning_rate": 0.0002, "epoch": 3.167725065371685, "step": 4240}, {"loss": 1.5407, "grad_norm": 0.5765156149864197, "learning_rate": 0.0002, "epoch": 3.175196115054165, "step": 4250}, {"loss": 1.4963, "grad_norm": 0.5801976919174194, "learning_rate": 0.0002, "epoch": 3.1826671647366456, "step": 4260}, {"loss": 1.4988, "grad_norm": 0.6260752081871033, "learning_rate": 0.0002, "epoch": 3.1901382144191257, "step": 4270}, {"loss": 1.5074, "grad_norm": 0.6610770225524902, "learning_rate": 0.0002, "epoch": 3.1976092641016063, "step": 4280}, {"loss": 1.4657, "grad_norm": 0.5762143135070801, "learning_rate": 0.0002, "epoch": 3.205080313784087, "step": 4290}, {"loss": 1.5181, "grad_norm": 0.5926990509033203, "learning_rate": 0.0002, "epoch": 3.212551363466567, "step": 4300}, {"loss": 1.5492, "grad_norm": 0.7373854517936707, "learning_rate": 0.0002, "epoch": 3.2200224131490476, "step": 4310}, {"loss": 1.4648, "grad_norm": 0.5963311195373535, "learning_rate": 0.0002, "epoch": 3.2274934628315277, "step": 4320}, {"loss": 1.5262, "grad_norm": 0.5754616856575012, "learning_rate": 0.0002, "epoch": 3.2349645125140083, "step": 4330}, {"loss": 1.4767, "grad_norm": 0.6116095781326294, "learning_rate": 0.0002, "epoch": 3.2424355621964884, "step": 4340}, {"loss": 1.5008, "grad_norm": 0.6001536846160889, "learning_rate": 0.0002, "epoch": 3.249906611878969, "step": 4350}, {"loss": 1.5738, "grad_norm": 0.5270227789878845, "learning_rate": 0.0002, "epoch": 3.257377661561449, "step": 4360}, {"loss": 1.5235, "grad_norm": 0.6666602492332458, "learning_rate": 0.0002, "epoch": 3.2648487112439297, "step": 4370}, {"loss": 1.5665, "grad_norm": 0.520310640335083, "learning_rate": 0.0002, "epoch": 3.2723197609264103, "step": 4380}, {"loss": 1.542, "grad_norm": 0.5165975093841553, "learning_rate": 0.0002, "epoch": 3.2797908106088904, "step": 4390}, {"loss": 1.4746, "grad_norm": 0.6080228686332703, "learning_rate": 0.0002, "epoch": 3.287261860291371, "step": 4400}, {"loss": 1.4901, "grad_norm": 0.670122504234314, "learning_rate": 0.0002, "epoch": 3.294732909973851, "step": 4410}, {"loss": 1.4677, "grad_norm": 0.6019457578659058, "learning_rate": 0.0002, "epoch": 3.3022039596563317, "step": 4420}, {"loss": 1.4249, "grad_norm": 0.5519300103187561, "learning_rate": 0.0002, "epoch": 3.309675009338812, "step": 4430}, {"loss": 1.555, "grad_norm": 0.5958521962165833, "learning_rate": 0.0002, "epoch": 3.3171460590212924, "step": 4440}, {"loss": 1.5067, "grad_norm": 0.5552705526351929, "learning_rate": 0.0002, "epoch": 3.324617108703773, "step": 4450}, {"loss": 1.5926, "grad_norm": 0.6583784818649292, "learning_rate": 0.0002, "epoch": 3.332088158386253, "step": 4460}, {"loss": 1.4206, "grad_norm": 0.5815939903259277, "learning_rate": 0.0002, "epoch": 3.3395592080687337, "step": 4470}, {"loss": 1.5942, "grad_norm": 1.3342205286026, "learning_rate": 0.0002, "epoch": 3.347030257751214, "step": 4480}, {"loss": 1.484, "grad_norm": 0.6341500878334045, "learning_rate": 0.0002, "epoch": 3.3545013074336945, "step": 4490}, {"loss": 1.5219, "grad_norm": 0.6384079456329346, "learning_rate": 0.0002, "epoch": 3.3619723571161746, "step": 4500}, {"loss": 1.5222, "grad_norm": 0.6098346710205078, "learning_rate": 0.0002, "epoch": 3.369443406798655, "step": 4510}, {"loss": 1.5475, "grad_norm": 0.5958296656608582, "learning_rate": 0.0002, "epoch": 3.3769144564811358, "step": 4520}, {"loss": 1.5171, "grad_norm": 0.6157881617546082, "learning_rate": 0.0002, "epoch": 3.384385506163616, "step": 4530}, {"loss": 1.569, "grad_norm": 0.5671007037162781, "learning_rate": 0.0002, "epoch": 3.3918565558460965, "step": 4540}, {"loss": 1.604, "grad_norm": 0.6203294992446899, "learning_rate": 0.0002, "epoch": 3.3993276055285766, "step": 4550}, {"loss": 1.5364, "grad_norm": 0.6743317246437073, "learning_rate": 0.0002, "epoch": 3.406798655211057, "step": 4560}, {"loss": 1.5034, "grad_norm": 0.731765627861023, "learning_rate": 0.0002, "epoch": 3.4142697048935373, "step": 4570}, {"loss": 1.4585, "grad_norm": 0.6285187602043152, "learning_rate": 0.0002, "epoch": 3.421740754576018, "step": 4580}, {"loss": 1.5296, "grad_norm": 0.612680196762085, "learning_rate": 0.0002, "epoch": 3.4292118042584985, "step": 4590}, {"loss": 1.5577, "grad_norm": 0.6413681507110596, "learning_rate": 0.0002, "epoch": 3.4366828539409786, "step": 4600}, {"loss": 1.5026, "grad_norm": 0.6240990161895752, "learning_rate": 0.0002, "epoch": 3.444153903623459, "step": 4610}, {"loss": 1.5887, "grad_norm": 0.5095735192298889, "learning_rate": 0.0002, "epoch": 3.4516249533059393, "step": 4620}, {"loss": 1.4906, "grad_norm": 0.5699611902236938, "learning_rate": 0.0002, "epoch": 3.45909600298842, "step": 4630}, {"loss": 1.5176, "grad_norm": 0.7289775609970093, "learning_rate": 0.0002, "epoch": 3.4665670526709, "step": 4640}, {"loss": 1.5467, "grad_norm": 0.6211609840393066, "learning_rate": 0.0002, "epoch": 3.4740381023533806, "step": 4650}, {"loss": 1.533, "grad_norm": 0.5714802145957947, "learning_rate": 0.0002, "epoch": 3.481509152035861, "step": 4660}, {"loss": 1.5096, "grad_norm": 0.6287049651145935, "learning_rate": 0.0002, "epoch": 3.4889802017183413, "step": 4670}, {"loss": 1.4212, "grad_norm": 0.5480595827102661, "learning_rate": 0.0002, "epoch": 3.496451251400822, "step": 4680}, {"loss": 1.4746, "grad_norm": 0.5683253407478333, "learning_rate": 0.0002, "epoch": 3.503922301083302, "step": 4690}, {"loss": 1.5012, "grad_norm": 0.601140558719635, "learning_rate": 0.0002, "epoch": 3.5113933507657826, "step": 4700}, {"loss": 1.5383, "grad_norm": 0.5344498157501221, "learning_rate": 0.0002, "epoch": 3.5188644004482628, "step": 4710}, {"loss": 1.5428, "grad_norm": 0.5739690661430359, "learning_rate": 0.0002, "epoch": 3.5263354501307433, "step": 4720}, {"loss": 1.5589, "grad_norm": 0.5640085935592651, "learning_rate": 0.0002, "epoch": 3.533806499813224, "step": 4730}, {"loss": 1.487, "grad_norm": 0.5967805981636047, "learning_rate": 0.0002, "epoch": 3.541277549495704, "step": 4740}, {"loss": 1.5461, "grad_norm": 0.6138835549354553, "learning_rate": 0.0002, "epoch": 3.5487485991781846, "step": 4750}, {"loss": 1.5502, "grad_norm": 0.6779900193214417, "learning_rate": 0.0002, "epoch": 3.5562196488606648, "step": 4760}, {"loss": 1.4917, "grad_norm": 0.6122010350227356, "learning_rate": 0.0002, "epoch": 3.5636906985431454, "step": 4770}, {"loss": 1.5405, "grad_norm": 0.5685241222381592, "learning_rate": 0.0002, "epoch": 3.5711617482256255, "step": 4780}, {"loss": 1.5427, "grad_norm": 0.604583203792572, "learning_rate": 0.0002, "epoch": 3.578632797908106, "step": 4790}, {"loss": 1.4514, "grad_norm": 0.651165246963501, "learning_rate": 0.0002, "epoch": 3.5861038475905866, "step": 4800}, {"loss": 1.4109, "grad_norm": 0.6398511528968811, "learning_rate": 0.0002, "epoch": 3.593574897273067, "step": 4810}, {"loss": 1.4261, "grad_norm": 0.6444641351699829, "learning_rate": 0.0002, "epoch": 3.6010459469555474, "step": 4820}, {"loss": 1.5274, "grad_norm": 0.6018481850624084, "learning_rate": 0.0002, "epoch": 3.6085169966380275, "step": 4830}, {"loss": 1.4647, "grad_norm": 0.6025291085243225, "learning_rate": 0.0002, "epoch": 3.615988046320508, "step": 4840}, {"loss": 1.5609, "grad_norm": 0.6810156106948853, "learning_rate": 0.0002, "epoch": 3.623459096002988, "step": 4850}, {"loss": 1.5299, "grad_norm": 0.6408044695854187, "learning_rate": 0.0002, "epoch": 3.630930145685469, "step": 4860}, {"loss": 1.5366, "grad_norm": 0.5608272552490234, "learning_rate": 0.0002, "epoch": 3.6384011953679494, "step": 4870}, {"loss": 1.5188, "grad_norm": 0.6136814951896667, "learning_rate": 0.0002, "epoch": 3.6458722450504295, "step": 4880}, {"loss": 1.5021, "grad_norm": 0.5927900075912476, "learning_rate": 0.0002, "epoch": 3.65334329473291, "step": 4890}, {"loss": 1.6084, "grad_norm": 0.5336901545524597, "learning_rate": 0.0002, "epoch": 3.66081434441539, "step": 4900}, {"loss": 1.5701, "grad_norm": 0.7823320627212524, "learning_rate": 0.0002, "epoch": 3.668285394097871, "step": 4910}, {"loss": 1.4881, "grad_norm": 0.6703504323959351, "learning_rate": 0.0002, "epoch": 3.675756443780351, "step": 4920}, {"loss": 1.5332, "grad_norm": 0.6061160564422607, "learning_rate": 0.0002, "epoch": 3.6832274934628315, "step": 4930}, {"loss": 1.5405, "grad_norm": 0.6237227916717529, "learning_rate": 0.0002, "epoch": 3.690698543145312, "step": 4940}, {"loss": 1.497, "grad_norm": 0.5985278487205505, "learning_rate": 0.0002, "epoch": 3.6981695928277922, "step": 4950}, {"loss": 1.5132, "grad_norm": 0.6483839750289917, "learning_rate": 0.0002, "epoch": 3.705640642510273, "step": 4960}, {"loss": 1.5338, "grad_norm": 0.5788805484771729, "learning_rate": 0.0002, "epoch": 3.713111692192753, "step": 4970}, {"loss": 1.5258, "grad_norm": 0.5609974265098572, "learning_rate": 0.0002, "epoch": 3.7205827418752335, "step": 4980}, {"loss": 1.4759, "grad_norm": 0.5681300759315491, "learning_rate": 0.0002, "epoch": 3.7280537915577137, "step": 4990}, {"loss": 1.6018, "grad_norm": 0.5860186219215393, "learning_rate": 0.0002, "epoch": 3.7355248412401942, "step": 5000}, {"loss": 1.58, "grad_norm": 0.5718157291412354, "learning_rate": 0.0002, "epoch": 3.742995890922675, "step": 5010}, {"loss": 1.5834, "grad_norm": 0.6173721551895142, "learning_rate": 0.0002, "epoch": 3.750466940605155, "step": 5020}, {"loss": 1.5617, "grad_norm": 0.629152238368988, "learning_rate": 0.0002, "epoch": 3.7579379902876355, "step": 5030}, {"loss": 1.519, "grad_norm": 0.5666284561157227, "learning_rate": 0.0002, "epoch": 3.7654090399701157, "step": 5040}, {"loss": 1.5329, "grad_norm": 0.6053005456924438, "learning_rate": 0.0002, "epoch": 3.7728800896525962, "step": 5050}, {"loss": 1.5404, "grad_norm": 0.5870583057403564, "learning_rate": 0.0002, "epoch": 3.7803511393350764, "step": 5060}, {"loss": 1.4444, "grad_norm": 0.5422009229660034, "learning_rate": 0.0002, "epoch": 3.787822189017557, "step": 5070}, {"loss": 1.5308, "grad_norm": 0.5396918058395386, "learning_rate": 0.0002, "epoch": 3.7952932387000375, "step": 5080}, {"loss": 1.464, "grad_norm": 0.5544713139533997, "learning_rate": 0.0002, "epoch": 3.8027642883825177, "step": 5090}, {"loss": 1.4752, "grad_norm": 0.5983749628067017, "learning_rate": 0.0002, "epoch": 3.8102353380649983, "step": 5100}, {"loss": 1.4972, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 3.8177063877474784, "step": 5110}, {"loss": 1.5471, "grad_norm": 0.5436882376670837, "learning_rate": 0.0002, "epoch": 3.825177437429959, "step": 5120}, {"loss": 1.5118, "grad_norm": 0.5453617572784424, "learning_rate": 0.0002, "epoch": 3.832648487112439, "step": 5130}, {"loss": 1.5732, "grad_norm": 0.6269069314002991, "learning_rate": 0.0002, "epoch": 3.8401195367949197, "step": 5140}, {"loss": 1.4959, "grad_norm": 0.6189185380935669, "learning_rate": 0.0002, "epoch": 3.8475905864774003, "step": 5150}, {"loss": 1.4999, "grad_norm": 0.6653388142585754, "learning_rate": 0.0002, "epoch": 3.8550616361598804, "step": 5160}, {"loss": 1.5075, "grad_norm": 0.5771768689155579, "learning_rate": 0.0002, "epoch": 3.862532685842361, "step": 5170}, {"loss": 1.5545, "grad_norm": 0.6052790880203247, "learning_rate": 0.0002, "epoch": 3.870003735524841, "step": 5180}, {"loss": 1.4987, "grad_norm": 0.6572316884994507, "learning_rate": 0.0002, "epoch": 3.8774747852073217, "step": 5190}, {"loss": 1.5241, "grad_norm": 0.670576810836792, "learning_rate": 0.0002, "epoch": 3.884945834889802, "step": 5200}, {"loss": 1.4777, "grad_norm": 0.5728798508644104, "learning_rate": 0.0002, "epoch": 3.8924168845722824, "step": 5210}, {"loss": 1.5351, "grad_norm": 0.6340774297714233, "learning_rate": 0.0002, "epoch": 3.899887934254763, "step": 5220}, {"loss": 1.5081, "grad_norm": 0.5981315970420837, "learning_rate": 0.0002, "epoch": 3.907358983937243, "step": 5230}, {"loss": 1.4875, "grad_norm": 0.6212025880813599, "learning_rate": 0.0002, "epoch": 3.9148300336197237, "step": 5240}, {"loss": 1.5545, "grad_norm": 0.6202296018600464, "learning_rate": 0.0002, "epoch": 3.922301083302204, "step": 5250}, {"loss": 1.5765, "grad_norm": 0.6159142255783081, "learning_rate": 0.0002, "epoch": 3.9297721329846844, "step": 5260}, {"loss": 1.4938, "grad_norm": 0.6519438624382019, "learning_rate": 0.0002, "epoch": 3.9372431826671646, "step": 5270}, {"loss": 1.4859, "grad_norm": 0.539813756942749, "learning_rate": 0.0002, "epoch": 3.944714232349645, "step": 5280}, {"loss": 1.5921, "grad_norm": 0.6443665027618408, "learning_rate": 0.0002, "epoch": 3.9521852820321257, "step": 5290}, {"loss": 1.5153, "grad_norm": 0.6635757684707642, "learning_rate": 0.0002, "epoch": 3.959656331714606, "step": 5300}, {"loss": 1.5485, "grad_norm": 0.589363157749176, "learning_rate": 0.0002, "epoch": 3.9671273813970864, "step": 5310}, {"loss": 1.5498, "grad_norm": 0.5788735747337341, "learning_rate": 0.0002, "epoch": 3.9745984310795666, "step": 5320}, {"loss": 1.5607, "grad_norm": 0.5976864695549011, "learning_rate": 0.0002, "epoch": 3.982069480762047, "step": 5330}, {"loss": 1.5302, "grad_norm": 0.6624067425727844, "learning_rate": 0.0002, "epoch": 3.9895405304445273, "step": 5340}, {"loss": 1.5904, "grad_norm": 0.6738956570625305, "learning_rate": 0.0002, "epoch": 3.997011580127008, "step": 5350}, {"eval_loss": 1.868006944656372, "eval_runtime": 38.5153, "eval_samples_per_second": 13.371, "eval_steps_per_second": 1.688, "epoch": 4.0, "step": 5354}, {"loss": 1.4535, "grad_norm": 0.6023468971252441, "learning_rate": 0.0002, "epoch": 4.004482629809488, "step": 5360}, {"loss": 1.3987, "grad_norm": 0.8589285612106323, "learning_rate": 0.0002, "epoch": 4.011953679491969, "step": 5370}, {"loss": 1.3952, "grad_norm": 0.7477491497993469, "learning_rate": 0.0002, "epoch": 4.019424729174449, "step": 5380}, {"loss": 1.3745, "grad_norm": 0.7601922154426575, "learning_rate": 0.0002, "epoch": 4.02689577885693, "step": 5390}, {"loss": 1.4133, "grad_norm": 0.8115614056587219, "learning_rate": 0.0002, "epoch": 4.03436682853941, "step": 5400}, {"loss": 1.3748, "grad_norm": 0.669925332069397, "learning_rate": 0.0002, "epoch": 4.04183787822189, "step": 5410}, {"loss": 1.2835, "grad_norm": 0.8091904520988464, "learning_rate": 0.0002, "epoch": 4.04930892790437, "step": 5420}, {"loss": 1.3615, "grad_norm": 0.709405779838562, "learning_rate": 0.0002, "epoch": 4.056779977586851, "step": 5430}, {"loss": 1.3558, "grad_norm": 1.0006179809570312, "learning_rate": 0.0002, "epoch": 4.064251027269331, "step": 5440}, {"loss": 1.3491, "grad_norm": 0.7017965912818909, "learning_rate": 0.0002, "epoch": 4.071722076951811, "step": 5450}, {"loss": 1.3642, "grad_norm": 0.8991572260856628, "learning_rate": 0.0002, "epoch": 4.0791931266342925, "step": 5460}, {"loss": 1.392, "grad_norm": 0.9064797759056091, "learning_rate": 0.0002, "epoch": 4.086664176316773, "step": 5470}, {"loss": 1.3425, "grad_norm": 0.7981749176979065, "learning_rate": 0.0002, "epoch": 4.094135225999253, "step": 5480}, {"loss": 1.3826, "grad_norm": 0.7280883193016052, "learning_rate": 0.0002, "epoch": 4.101606275681733, "step": 5490}, {"loss": 1.3275, "grad_norm": 0.7419600486755371, "learning_rate": 0.0002, "epoch": 4.109077325364214, "step": 5500}, {"loss": 1.3199, "grad_norm": 0.8019949197769165, "learning_rate": 0.0002, "epoch": 4.116548375046694, "step": 5510}, {"loss": 1.3133, "grad_norm": 0.7501229047775269, "learning_rate": 0.0002, "epoch": 4.124019424729174, "step": 5520}, {"loss": 1.4432, "grad_norm": 0.8166249990463257, "learning_rate": 0.0002, "epoch": 4.131490474411655, "step": 5530}, {"loss": 1.3901, "grad_norm": 0.9728496074676514, "learning_rate": 0.0002, "epoch": 4.138961524094135, "step": 5540}, {"loss": 1.3538, "grad_norm": 0.7590922117233276, "learning_rate": 0.0002, "epoch": 4.1464325737766154, "step": 5550}, {"loss": 1.4368, "grad_norm": 0.7759010791778564, "learning_rate": 0.0002, "epoch": 4.153903623459096, "step": 5560}, {"loss": 1.3635, "grad_norm": 0.9057986736297607, "learning_rate": 0.0002, "epoch": 4.161374673141577, "step": 5570}, {"loss": 1.4152, "grad_norm": 0.8853937983512878, "learning_rate": 0.0002, "epoch": 4.168845722824057, "step": 5580}, {"loss": 1.3633, "grad_norm": 0.7070684432983398, "learning_rate": 0.0002, "epoch": 4.176316772506537, "step": 5590}, {"loss": 1.3218, "grad_norm": 0.7649410963058472, "learning_rate": 0.0002, "epoch": 4.183787822189018, "step": 5600}, {"loss": 1.3857, "grad_norm": 1.2048029899597168, "learning_rate": 0.0002, "epoch": 4.191258871871498, "step": 5610}, {"loss": 1.3629, "grad_norm": 0.7986605763435364, "learning_rate": 0.0002, "epoch": 4.198729921553978, "step": 5620}, {"loss": 1.3995, "grad_norm": 0.8151885867118835, "learning_rate": 0.0002, "epoch": 4.206200971236458, "step": 5630}, {"loss": 1.3782, "grad_norm": 0.7719064354896545, "learning_rate": 0.0002, "epoch": 4.213672020918939, "step": 5640}, {"loss": 1.3852, "grad_norm": 0.8422448039054871, "learning_rate": 0.0002, "epoch": 4.2211430706014195, "step": 5650}, {"loss": 1.3321, "grad_norm": 0.7017164826393127, "learning_rate": 0.0002, "epoch": 4.2286141202839, "step": 5660}, {"loss": 1.4105, "grad_norm": 0.8559677600860596, "learning_rate": 0.0002, "epoch": 4.236085169966381, "step": 5670}, {"loss": 1.3701, "grad_norm": 0.8216157555580139, "learning_rate": 0.0002, "epoch": 4.243556219648861, "step": 5680}, {"loss": 1.3565, "grad_norm": 0.7681755423545837, "learning_rate": 0.0002, "epoch": 4.251027269331341, "step": 5690}, {"loss": 1.3806, "grad_norm": 0.811665952205658, "learning_rate": 0.0002, "epoch": 4.258498319013821, "step": 5700}, {"loss": 1.4161, "grad_norm": 0.7242204546928406, "learning_rate": 0.0002, "epoch": 4.265969368696302, "step": 5710}, {"loss": 1.2958, "grad_norm": 0.7570181488990784, "learning_rate": 0.0002, "epoch": 4.273440418378782, "step": 5720}, {"loss": 1.4265, "grad_norm": 0.8951969146728516, "learning_rate": 0.0002, "epoch": 4.280911468061262, "step": 5730}, {"loss": 1.3895, "grad_norm": 0.7222902178764343, "learning_rate": 0.0002, "epoch": 4.288382517743743, "step": 5740}, {"loss": 1.4155, "grad_norm": 0.8508469462394714, "learning_rate": 0.0002, "epoch": 4.2958535674262235, "step": 5750}, {"loss": 1.365, "grad_norm": 0.7215430736541748, "learning_rate": 0.0002, "epoch": 4.303324617108704, "step": 5760}, {"loss": 1.4472, "grad_norm": 0.8774884939193726, "learning_rate": 0.0002, "epoch": 4.310795666791184, "step": 5770}, {"loss": 1.427, "grad_norm": 0.8354552984237671, "learning_rate": 0.0002, "epoch": 4.318266716473665, "step": 5780}, {"loss": 1.3222, "grad_norm": 0.6938814520835876, "learning_rate": 0.0002, "epoch": 4.325737766156145, "step": 5790}, {"loss": 1.3589, "grad_norm": 0.78675377368927, "learning_rate": 0.0002, "epoch": 4.333208815838625, "step": 5800}, {"loss": 1.3662, "grad_norm": 0.7147697806358337, "learning_rate": 0.0002, "epoch": 4.340679865521106, "step": 5810}, {"loss": 1.3597, "grad_norm": 0.7693623304367065, "learning_rate": 0.0002, "epoch": 4.348150915203586, "step": 5820}, {"loss": 1.2944, "grad_norm": 0.856517493724823, "learning_rate": 0.0002, "epoch": 4.355621964886066, "step": 5830}, {"loss": 1.4307, "grad_norm": 0.7200973033905029, "learning_rate": 0.0002, "epoch": 4.3630930145685465, "step": 5840}, {"loss": 1.442, "grad_norm": 0.743281364440918, "learning_rate": 0.0002, "epoch": 4.3705640642510275, "step": 5850}, {"loss": 1.3999, "grad_norm": 0.7627727389335632, "learning_rate": 0.0002, "epoch": 4.378035113933508, "step": 5860}, {"loss": 1.4082, "grad_norm": 0.7238836884498596, "learning_rate": 0.0002, "epoch": 4.385506163615988, "step": 5870}, {"loss": 1.4292, "grad_norm": 0.7253410816192627, "learning_rate": 0.0002, "epoch": 4.392977213298469, "step": 5880}, {"loss": 1.3774, "grad_norm": 0.8232238292694092, "learning_rate": 0.0002, "epoch": 4.400448262980949, "step": 5890}, {"loss": 1.3757, "grad_norm": 0.8778504729270935, "learning_rate": 0.0002, "epoch": 4.407919312663429, "step": 5900}, {"loss": 1.387, "grad_norm": 0.7639474868774414, "learning_rate": 0.0002, "epoch": 4.415390362345909, "step": 5910}, {"loss": 1.3862, "grad_norm": 0.7666519284248352, "learning_rate": 0.0002, "epoch": 4.42286141202839, "step": 5920}, {"loss": 1.4168, "grad_norm": 0.867132842540741, "learning_rate": 0.0002, "epoch": 4.43033246171087, "step": 5930}, {"loss": 1.4772, "grad_norm": 0.7571166753768921, "learning_rate": 0.0002, "epoch": 4.4378035113933505, "step": 5940}, {"loss": 1.4401, "grad_norm": 0.7911370992660522, "learning_rate": 0.0002, "epoch": 4.4452745610758315, "step": 5950}, {"loss": 1.4516, "grad_norm": 0.8844250440597534, "learning_rate": 0.0002, "epoch": 4.452745610758312, "step": 5960}, {"loss": 1.4109, "grad_norm": 0.7336231470108032, "learning_rate": 0.0002, "epoch": 4.460216660440792, "step": 5970}, {"loss": 1.3891, "grad_norm": 0.8162738084793091, "learning_rate": 0.0002, "epoch": 4.467687710123272, "step": 5980}, {"loss": 1.393, "grad_norm": 0.7413017153739929, "learning_rate": 0.0002, "epoch": 4.475158759805753, "step": 5990}, {"loss": 1.3712, "grad_norm": 0.7215432524681091, "learning_rate": 0.0002, "epoch": 4.482629809488233, "step": 6000}, {"loss": 1.3521, "grad_norm": 0.8943389058113098, "learning_rate": 0.0002, "epoch": 4.490100859170713, "step": 6010}, {"loss": 1.4172, "grad_norm": 0.7850823998451233, "learning_rate": 0.0002, "epoch": 4.497571908853194, "step": 6020}, {"loss": 1.3582, "grad_norm": 0.8117504119873047, "learning_rate": 0.0002, "epoch": 4.505042958535674, "step": 6030}, {"loss": 1.4272, "grad_norm": 0.8381605744361877, "learning_rate": 0.0002, "epoch": 4.5125140082181545, "step": 6040}, {"loss": 1.3829, "grad_norm": 0.7964059710502625, "learning_rate": 0.0002, "epoch": 4.519985057900635, "step": 6050}, {"loss": 1.3555, "grad_norm": 0.7935128211975098, "learning_rate": 0.0002, "epoch": 4.527456107583116, "step": 6060}, {"loss": 1.3994, "grad_norm": 0.8725124597549438, "learning_rate": 0.0002, "epoch": 4.534927157265596, "step": 6070}, {"loss": 1.3923, "grad_norm": 0.880325198173523, "learning_rate": 0.0002, "epoch": 4.542398206948076, "step": 6080}, {"loss": 1.4459, "grad_norm": 0.7220637202262878, "learning_rate": 0.0002, "epoch": 4.549869256630557, "step": 6090}, {"loss": 1.3281, "grad_norm": 0.6908547878265381, "learning_rate": 0.0002, "epoch": 4.557340306313037, "step": 6100}, {"loss": 1.437, "grad_norm": 0.797931969165802, "learning_rate": 0.0002, "epoch": 4.564811355995517, "step": 6110}, {"loss": 1.4023, "grad_norm": 0.7056134343147278, "learning_rate": 0.0002, "epoch": 4.572282405677997, "step": 6120}, {"loss": 1.3814, "grad_norm": 0.7850478887557983, "learning_rate": 0.0002, "epoch": 4.579753455360478, "step": 6130}, {"loss": 1.3579, "grad_norm": 0.8112621307373047, "learning_rate": 0.0002, "epoch": 4.5872245050429585, "step": 6140}, {"loss": 1.3523, "grad_norm": 0.7040849328041077, "learning_rate": 0.0002, "epoch": 4.594695554725439, "step": 6150}, {"loss": 1.3526, "grad_norm": 0.7214553952217102, "learning_rate": 0.0002, "epoch": 4.60216660440792, "step": 6160}, {"loss": 1.3932, "grad_norm": 0.8616511821746826, "learning_rate": 0.0002, "epoch": 4.6096376540904, "step": 6170}, {"loss": 1.4622, "grad_norm": 0.8374658226966858, "learning_rate": 0.0002, "epoch": 4.61710870377288, "step": 6180}, {"loss": 1.3703, "grad_norm": 0.6761606931686401, "learning_rate": 0.0002, "epoch": 4.62457975345536, "step": 6190}, {"loss": 1.3977, "grad_norm": 0.768028199672699, "learning_rate": 0.0002, "epoch": 4.632050803137841, "step": 6200}, {"loss": 1.3772, "grad_norm": 0.9372717142105103, "learning_rate": 0.0002, "epoch": 4.639521852820321, "step": 6210}, {"loss": 1.4098, "grad_norm": 0.7906546592712402, "learning_rate": 0.0002, "epoch": 4.646992902502801, "step": 6220}, {"loss": 1.3962, "grad_norm": 0.7376723289489746, "learning_rate": 0.0002, "epoch": 4.654463952185282, "step": 6230}, {"loss": 1.4529, "grad_norm": 0.8972630500793457, "learning_rate": 0.0002, "epoch": 4.6619350018677626, "step": 6240}, {"loss": 1.4668, "grad_norm": 0.8261756300926208, "learning_rate": 0.0002, "epoch": 4.669406051550243, "step": 6250}, {"loss": 1.3267, "grad_norm": 0.7512393593788147, "learning_rate": 0.0002, "epoch": 4.676877101232723, "step": 6260}, {"loss": 1.4278, "grad_norm": 0.7132362127304077, "learning_rate": 0.0002, "epoch": 4.684348150915204, "step": 6270}, {"loss": 1.4299, "grad_norm": 0.7690575122833252, "learning_rate": 0.0002, "epoch": 4.691819200597684, "step": 6280}, {"loss": 1.4769, "grad_norm": 0.9886258840560913, "learning_rate": 0.0002, "epoch": 4.699290250280164, "step": 6290}, {"loss": 1.4005, "grad_norm": 0.9502435922622681, "learning_rate": 0.0002, "epoch": 4.706761299962645, "step": 6300}, {"loss": 1.4319, "grad_norm": 0.702255129814148, "learning_rate": 0.0002, "epoch": 4.714232349645125, "step": 6310}, {"loss": 1.4447, "grad_norm": 0.7713103890419006, "learning_rate": 0.0002, "epoch": 4.721703399327605, "step": 6320}, {"loss": 1.4392, "grad_norm": 0.7778580784797668, "learning_rate": 0.0002, "epoch": 4.7291744490100855, "step": 6330}, {"loss": 1.4169, "grad_norm": 0.7275111079216003, "learning_rate": 0.0002, "epoch": 4.736645498692567, "step": 6340}, {"loss": 1.4429, "grad_norm": 0.7728744149208069, "learning_rate": 0.0002, "epoch": 4.744116548375047, "step": 6350}, {"loss": 1.3756, "grad_norm": 0.9724260568618774, "learning_rate": 0.0002, "epoch": 4.751587598057527, "step": 6360}, {"loss": 1.3358, "grad_norm": 0.7505622506141663, "learning_rate": 0.0002, "epoch": 4.759058647740007, "step": 6370}, {"loss": 1.379, "grad_norm": 0.7994682788848877, "learning_rate": 0.0002, "epoch": 4.766529697422488, "step": 6380}, {"loss": 1.4275, "grad_norm": 0.8432038426399231, "learning_rate": 0.0002, "epoch": 4.774000747104968, "step": 6390}, {"loss": 1.4606, "grad_norm": 0.7436022758483887, "learning_rate": 0.0002, "epoch": 4.781471796787448, "step": 6400}, {"loss": 1.3461, "grad_norm": 0.7709194421768188, "learning_rate": 0.0002, "epoch": 4.788942846469929, "step": 6410}, {"loss": 1.3715, "grad_norm": 0.8798436522483826, "learning_rate": 0.0002, "epoch": 4.796413896152409, "step": 6420}, {"loss": 1.3761, "grad_norm": 0.790189266204834, "learning_rate": 0.0002, "epoch": 4.80388494583489, "step": 6430}, {"loss": 1.4109, "grad_norm": 0.6824303865432739, "learning_rate": 0.0002, "epoch": 4.811355995517371, "step": 6440}, {"loss": 1.3877, "grad_norm": 0.7501044869422913, "learning_rate": 0.0002, "epoch": 4.818827045199851, "step": 6450}, {"loss": 1.4458, "grad_norm": 0.8840398192405701, "learning_rate": 0.0002, "epoch": 4.826298094882331, "step": 6460}, {"loss": 1.4412, "grad_norm": 0.7812688946723938, "learning_rate": 0.0002, "epoch": 4.833769144564811, "step": 6470}, {"loss": 1.4299, "grad_norm": 0.7429926991462708, "learning_rate": 0.0002, "epoch": 4.841240194247292, "step": 6480}, {"loss": 1.5062, "grad_norm": 0.7778021693229675, "learning_rate": 0.0002, "epoch": 4.848711243929772, "step": 6490}, {"loss": 1.4589, "grad_norm": 0.8270702362060547, "learning_rate": 0.0002, "epoch": 4.856182293612252, "step": 6500}, {"loss": 1.4091, "grad_norm": 0.6960513591766357, "learning_rate": 0.0002, "epoch": 4.863653343294732, "step": 6510}, {"loss": 1.376, "grad_norm": 0.7728942632675171, "learning_rate": 0.0002, "epoch": 4.8711243929772134, "step": 6520}, {"loss": 1.4852, "grad_norm": 0.7377303838729858, "learning_rate": 0.0002, "epoch": 4.878595442659694, "step": 6530}, {"loss": 1.3846, "grad_norm": 0.7257253527641296, "learning_rate": 0.0002, "epoch": 4.886066492342174, "step": 6540}, {"loss": 1.4166, "grad_norm": 0.7875821590423584, "learning_rate": 0.0002, "epoch": 4.893537542024655, "step": 6550}, {"loss": 1.357, "grad_norm": 0.8346304297447205, "learning_rate": 0.0002, "epoch": 4.901008591707135, "step": 6560}, {"loss": 1.4522, "grad_norm": 0.7710739374160767, "learning_rate": 0.0002, "epoch": 4.908479641389615, "step": 6570}, {"loss": 1.4465, "grad_norm": 0.7015138268470764, "learning_rate": 0.0002, "epoch": 4.915950691072096, "step": 6580}, {"loss": 1.435, "grad_norm": 0.8707432150840759, "learning_rate": 0.0002, "epoch": 4.923421740754576, "step": 6590}, {"loss": 1.2968, "grad_norm": 0.786601185798645, "learning_rate": 0.0002, "epoch": 4.930892790437056, "step": 6600}, {"loss": 1.4385, "grad_norm": 0.978519082069397, "learning_rate": 0.0002, "epoch": 4.938363840119536, "step": 6610}, {"loss": 1.3997, "grad_norm": 0.8102927207946777, "learning_rate": 0.0002, "epoch": 4.9458348898020175, "step": 6620}, {"loss": 1.4859, "grad_norm": 0.7628704309463501, "learning_rate": 0.0002, "epoch": 4.953305939484498, "step": 6630}, {"loss": 1.3774, "grad_norm": 0.8053455352783203, "learning_rate": 0.0002, "epoch": 4.960776989166978, "step": 6640}, {"loss": 1.5092, "grad_norm": 0.8680412173271179, "learning_rate": 0.0002, "epoch": 4.968248038849458, "step": 6650}, {"loss": 1.3978, "grad_norm": 0.7415758371353149, "learning_rate": 0.0002, "epoch": 4.975719088531939, "step": 6660}, {"loss": 1.3793, "grad_norm": 0.7730312347412109, "learning_rate": 0.0002, "epoch": 4.983190138214419, "step": 6670}, {"loss": 1.4863, "grad_norm": 0.7924041152000427, "learning_rate": 0.0002, "epoch": 4.990661187896899, "step": 6680}, {"loss": 1.4137, "grad_norm": 0.8677893877029419, "learning_rate": 0.0002, "epoch": 4.99813223757938, "step": 6690}, {"eval_loss": 1.9444633722305298, "eval_runtime": 39.3488, "eval_samples_per_second": 13.088, "eval_steps_per_second": 1.652, "epoch": 4.999626447515876, "step": 6692}, {"loss": 1.3076, "grad_norm": 0.7102245092391968, "learning_rate": 0.0002, "epoch": 5.00560328726186, "step": 6700}, {"loss": 1.2714, "grad_norm": 1.0425463914871216, "learning_rate": 0.0002, "epoch": 5.0130743369443405, "step": 6710}, {"loss": 1.181, "grad_norm": 0.9320756793022156, "learning_rate": 0.0002, "epoch": 5.0205453866268215, "step": 6720}, {"loss": 1.1786, "grad_norm": 0.8797217607498169, "learning_rate": 0.0002, "epoch": 5.028016436309302, "step": 6730}, {"loss": 1.2097, "grad_norm": 2.135707139968872, "learning_rate": 0.0002, "epoch": 5.035487485991782, "step": 6740}, {"loss": 1.1761, "grad_norm": 0.8747734427452087, "learning_rate": 0.0002, "epoch": 5.042958535674262, "step": 6750}, {"loss": 1.1675, "grad_norm": 0.9981076717376709, "learning_rate": 0.0002, "epoch": 5.050429585356743, "step": 6760}, {"loss": 1.1976, "grad_norm": 0.985078752040863, "learning_rate": 0.0002, "epoch": 5.057900635039223, "step": 6770}, {"loss": 1.2688, "grad_norm": 1.0974019765853882, "learning_rate": 0.0002, "epoch": 5.065371684721703, "step": 6780}, {"loss": 1.1982, "grad_norm": 0.9823219180107117, "learning_rate": 0.0002, "epoch": 5.072842734404184, "step": 6790}, {"loss": 1.2586, "grad_norm": 1.122605562210083, "learning_rate": 0.0002, "epoch": 5.080313784086664, "step": 6800}, {"loss": 1.2069, "grad_norm": 0.8556802272796631, "learning_rate": 0.0002, "epoch": 5.0877848337691445, "step": 6810}, {"loss": 1.1908, "grad_norm": 1.1699262857437134, "learning_rate": 0.0002, "epoch": 5.095255883451625, "step": 6820}, {"loss": 1.1869, "grad_norm": 1.0440590381622314, "learning_rate": 0.0002, "epoch": 5.102726933134106, "step": 6830}, {"loss": 1.1655, "grad_norm": 1.0445300340652466, "learning_rate": 0.0002, "epoch": 5.110197982816586, "step": 6840}, {"loss": 1.2392, "grad_norm": 0.8289563059806824, "learning_rate": 0.0002, "epoch": 5.117669032499066, "step": 6850}, {"loss": 1.1687, "grad_norm": 1.1051193475723267, "learning_rate": 0.0002, "epoch": 5.125140082181547, "step": 6860}, {"loss": 1.2737, "grad_norm": 0.9345614910125732, "learning_rate": 0.0002, "epoch": 5.132611131864027, "step": 6870}, {"loss": 1.3021, "grad_norm": 1.1222996711730957, "learning_rate": 0.0002, "epoch": 5.140082181546507, "step": 6880}, {"loss": 1.2408, "grad_norm": 0.9405338764190674, "learning_rate": 0.0002, "epoch": 5.147553231228987, "step": 6890}, {"loss": 1.2367, "grad_norm": 1.0935171842575073, "learning_rate": 0.0002, "epoch": 5.155024280911468, "step": 6900}, {"loss": 1.2458, "grad_norm": 1.0438612699508667, "learning_rate": 0.0002, "epoch": 5.1624953305939485, "step": 6910}, {"loss": 1.2562, "grad_norm": 1.1189004182815552, "learning_rate": 0.0002, "epoch": 5.169966380276429, "step": 6920}, {"loss": 1.25, "grad_norm": 1.0533215999603271, "learning_rate": 0.0002, "epoch": 5.17743742995891, "step": 6930}, {"loss": 1.2974, "grad_norm": 0.9779648780822754, "learning_rate": 0.0002, "epoch": 5.18490847964139, "step": 6940}, {"loss": 1.1965, "grad_norm": 0.8920868635177612, "learning_rate": 0.0002, "epoch": 5.19237952932387, "step": 6950}, {"loss": 1.283, "grad_norm": 0.8374548554420471, "learning_rate": 0.0002, "epoch": 5.19985057900635, "step": 6960}, {"loss": 1.2775, "grad_norm": 1.0490682125091553, "learning_rate": 0.0002, "epoch": 5.207321628688831, "step": 6970}, {"loss": 1.1826, "grad_norm": 0.9658287167549133, "learning_rate": 0.0002, "epoch": 5.214792678371311, "step": 6980}, {"loss": 1.2647, "grad_norm": 0.9652056097984314, "learning_rate": 0.0002, "epoch": 5.222263728053791, "step": 6990}, {"loss": 1.3023, "grad_norm": 0.9141794443130493, "learning_rate": 0.0002, "epoch": 5.229734777736272, "step": 7000}, {"loss": 1.2456, "grad_norm": 0.9831376671791077, "learning_rate": 0.0002, "epoch": 5.2372058274187525, "step": 7010}, {"loss": 1.2176, "grad_norm": 1.0198718309402466, "learning_rate": 0.0002, "epoch": 5.244676877101233, "step": 7020}, {"loss": 1.2643, "grad_norm": 0.9647888541221619, "learning_rate": 0.0002, "epoch": 5.252147926783713, "step": 7030}, {"loss": 1.2106, "grad_norm": 1.3941649198532104, "learning_rate": 0.0002, "epoch": 5.259618976466194, "step": 7040}, {"loss": 1.2885, "grad_norm": 1.0305466651916504, "learning_rate": 0.0002, "epoch": 5.267090026148674, "step": 7050}, {"loss": 1.2362, "grad_norm": 0.9577859044075012, "learning_rate": 0.0002, "epoch": 5.274561075831154, "step": 7060}, {"loss": 1.2231, "grad_norm": 1.149092197418213, "learning_rate": 0.0002, "epoch": 5.282032125513634, "step": 7070}, {"loss": 1.2986, "grad_norm": 1.2582733631134033, "learning_rate": 0.0002, "epoch": 5.289503175196115, "step": 7080}, {"loss": 1.2307, "grad_norm": 1.1777442693710327, "learning_rate": 0.0002, "epoch": 5.296974224878595, "step": 7090}, {"loss": 1.24, "grad_norm": 1.0076404809951782, "learning_rate": 0.0002, "epoch": 5.3044452745610755, "step": 7100}, {"loss": 1.1407, "grad_norm": 0.9037365913391113, "learning_rate": 0.0002, "epoch": 5.3119163242435565, "step": 7110}, {"loss": 1.238, "grad_norm": 0.9428724646568298, "learning_rate": 0.0002, "epoch": 5.319387373926037, "step": 7120}, {"loss": 1.2571, "grad_norm": 0.9935154318809509, "learning_rate": 0.0002, "epoch": 5.326858423608517, "step": 7130}, {"loss": 1.2833, "grad_norm": 1.087500810623169, "learning_rate": 0.0002, "epoch": 5.334329473290998, "step": 7140}, {"loss": 1.2304, "grad_norm": 0.8543072938919067, "learning_rate": 0.0002, "epoch": 5.341800522973478, "step": 7150}, {"loss": 1.2755, "grad_norm": 0.9323700070381165, "learning_rate": 0.0002, "epoch": 5.349271572655958, "step": 7160}, {"loss": 1.2769, "grad_norm": 1.0037827491760254, "learning_rate": 0.0002, "epoch": 5.356742622338438, "step": 7170}, {"loss": 1.3204, "grad_norm": 0.8746469616889954, "learning_rate": 0.0002, "epoch": 5.364213672020919, "step": 7180}, {"loss": 1.2759, "grad_norm": 0.9516328573226929, "learning_rate": 0.0002, "epoch": 5.371684721703399, "step": 7190}, {"loss": 1.2428, "grad_norm": 0.9395177364349365, "learning_rate": 0.0002, "epoch": 5.3791557713858795, "step": 7200}, {"loss": 1.3214, "grad_norm": 1.000369906425476, "learning_rate": 0.0002, "epoch": 5.38662682106836, "step": 7210}, {"loss": 1.2337, "grad_norm": 1.0845502614974976, "learning_rate": 0.0002, "epoch": 5.394097870750841, "step": 7220}, {"loss": 1.2776, "grad_norm": 0.8975145220756531, "learning_rate": 0.0002, "epoch": 5.401568920433321, "step": 7230}, {"loss": 1.2306, "grad_norm": 1.040077805519104, "learning_rate": 0.0002, "epoch": 5.409039970115801, "step": 7240}, {"loss": 1.2277, "grad_norm": 1.0729942321777344, "learning_rate": 0.0002, "epoch": 5.416511019798282, "step": 7250}, {"loss": 1.2714, "grad_norm": 0.8322232961654663, "learning_rate": 0.0002, "epoch": 5.423982069480762, "step": 7260}, {"loss": 1.3036, "grad_norm": 1.0654641389846802, "learning_rate": 0.0002, "epoch": 5.431453119163242, "step": 7270}, {"loss": 1.268, "grad_norm": 1.0445852279663086, "learning_rate": 0.0002, "epoch": 5.438924168845723, "step": 7280}, {"loss": 1.2743, "grad_norm": 1.0762956142425537, "learning_rate": 0.0002, "epoch": 5.446395218528203, "step": 7290}, {"loss": 1.2887, "grad_norm": 0.9721953868865967, "learning_rate": 0.0002, "epoch": 5.4538662682106835, "step": 7300}, {"loss": 1.2833, "grad_norm": 0.9238539338111877, "learning_rate": 0.0002, "epoch": 5.461337317893164, "step": 7310}, {"loss": 1.255, "grad_norm": 0.9912874102592468, "learning_rate": 0.0002, "epoch": 5.468808367575645, "step": 7320}, {"loss": 1.2557, "grad_norm": 1.0727077722549438, "learning_rate": 0.0002, "epoch": 5.476279417258125, "step": 7330}, {"loss": 1.3471, "grad_norm": 0.8633865118026733, "learning_rate": 0.0002, "epoch": 5.483750466940605, "step": 7340}, {"loss": 1.3155, "grad_norm": 0.9396262764930725, "learning_rate": 0.0002, "epoch": 5.491221516623085, "step": 7350}, {"loss": 1.3146, "grad_norm": 1.0253715515136719, "learning_rate": 0.0002, "epoch": 5.498692566305566, "step": 7360}, {"loss": 1.3156, "grad_norm": 1.006047010421753, "learning_rate": 0.0002, "epoch": 5.506163615988046, "step": 7370}, {"loss": 1.3107, "grad_norm": 0.9781233072280884, "learning_rate": 0.0002, "epoch": 5.513634665670526, "step": 7380}, {"loss": 1.2703, "grad_norm": 0.9945126175880432, "learning_rate": 0.0002, "epoch": 5.521105715353007, "step": 7390}, {"loss": 1.1936, "grad_norm": 0.9081175327301025, "learning_rate": 0.0002, "epoch": 5.528576765035488, "step": 7400}, {"loss": 1.2651, "grad_norm": 1.2215938568115234, "learning_rate": 0.0002, "epoch": 5.536047814717968, "step": 7410}, {"loss": 1.2484, "grad_norm": 1.0724077224731445, "learning_rate": 0.0002, "epoch": 5.543518864400449, "step": 7420}, {"loss": 1.3083, "grad_norm": 1.106955885887146, "learning_rate": 0.0002, "epoch": 5.550989914082929, "step": 7430}, {"loss": 1.2125, "grad_norm": 1.0657650232315063, "learning_rate": 0.0002, "epoch": 5.558460963765409, "step": 7440}, {"loss": 1.2576, "grad_norm": 0.9725455641746521, "learning_rate": 0.0002, "epoch": 5.565932013447889, "step": 7450}, {"loss": 1.3297, "grad_norm": 0.8604224324226379, "learning_rate": 0.0002, "epoch": 5.57340306313037, "step": 7460}, {"loss": 1.3084, "grad_norm": 0.9913371205329895, "learning_rate": 0.0002, "epoch": 5.58087411281285, "step": 7470}, {"loss": 1.3371, "grad_norm": 1.012073040008545, "learning_rate": 0.0002, "epoch": 5.58834516249533, "step": 7480}, {"loss": 1.2526, "grad_norm": 1.1003159284591675, "learning_rate": 0.0002, "epoch": 5.5958162121778106, "step": 7490}, {"loss": 1.2577, "grad_norm": 0.9104593992233276, "learning_rate": 0.0002, "epoch": 5.603287261860292, "step": 7500}, {"loss": 1.2578, "grad_norm": 0.9480831623077393, "learning_rate": 0.0002, "epoch": 5.610758311542772, "step": 7510}, {"loss": 1.3056, "grad_norm": 1.0826456546783447, "learning_rate": 0.0002, "epoch": 5.618229361225252, "step": 7520}, {"loss": 1.2931, "grad_norm": 0.8286259174346924, "learning_rate": 0.0002, "epoch": 5.625700410907733, "step": 7530}, {"loss": 1.2918, "grad_norm": 0.9145061373710632, "learning_rate": 0.0002, "epoch": 5.633171460590213, "step": 7540}, {"loss": 1.1736, "grad_norm": 0.9363601803779602, "learning_rate": 0.0002, "epoch": 5.640642510272693, "step": 7550}, {"loss": 1.2265, "grad_norm": 0.9553244709968567, "learning_rate": 0.0002, "epoch": 5.648113559955174, "step": 7560}, {"loss": 1.2356, "grad_norm": 1.0343557596206665, "learning_rate": 0.0002, "epoch": 5.655584609637654, "step": 7570}, {"loss": 1.3171, "grad_norm": 0.8734238743782043, "learning_rate": 0.0002, "epoch": 5.663055659320134, "step": 7580}, {"loss": 1.2785, "grad_norm": 1.0230586528778076, "learning_rate": 0.0002, "epoch": 5.670526709002615, "step": 7590}, {"loss": 1.2936, "grad_norm": 1.0063409805297852, "learning_rate": 0.0002, "epoch": 5.677997758685096, "step": 7600}, {"loss": 1.2396, "grad_norm": 1.0104626417160034, "learning_rate": 0.0002, "epoch": 5.685468808367576, "step": 7610}, {"loss": 1.2581, "grad_norm": 0.9528168439865112, "learning_rate": 0.0002, "epoch": 5.692939858050056, "step": 7620}, {"loss": 1.3116, "grad_norm": 0.9799878597259521, "learning_rate": 0.0002, "epoch": 5.700410907732536, "step": 7630}, {"loss": 1.2632, "grad_norm": 0.969351589679718, "learning_rate": 0.0002, "epoch": 5.707881957415017, "step": 7640}, {"loss": 1.3055, "grad_norm": 1.3037652969360352, "learning_rate": 0.0002, "epoch": 5.715353007097497, "step": 7650}, {"loss": 1.3126, "grad_norm": 1.0640486478805542, "learning_rate": 0.0002, "epoch": 5.722824056779977, "step": 7660}, {"loss": 1.3325, "grad_norm": 1.0416420698165894, "learning_rate": 0.0002, "epoch": 5.730295106462458, "step": 7670}, {"loss": 1.25, "grad_norm": 0.8893619775772095, "learning_rate": 0.0002, "epoch": 5.7377661561449385, "step": 7680}, {"loss": 1.319, "grad_norm": 0.8512844443321228, "learning_rate": 0.0002, "epoch": 5.745237205827419, "step": 7690}, {"loss": 1.3328, "grad_norm": 0.9955748319625854, "learning_rate": 0.0002, "epoch": 5.7527082555099, "step": 7700}, {"loss": 1.294, "grad_norm": 1.0409910678863525, "learning_rate": 0.0002, "epoch": 5.76017930519238, "step": 7710}, {"loss": 1.3518, "grad_norm": 1.010097861289978, "learning_rate": 0.0002, "epoch": 5.76765035487486, "step": 7720}, {"loss": 1.2106, "grad_norm": 0.8974892497062683, "learning_rate": 0.0002, "epoch": 5.77512140455734, "step": 7730}, {"loss": 1.2743, "grad_norm": 0.972835123538971, "learning_rate": 0.0002, "epoch": 5.782592454239821, "step": 7740}, {"loss": 1.3549, "grad_norm": 0.9607440829277039, "learning_rate": 0.0002, "epoch": 5.790063503922301, "step": 7750}, {"loss": 1.29, "grad_norm": 0.9426500797271729, "learning_rate": 0.0002, "epoch": 5.797534553604781, "step": 7760}, {"loss": 1.274, "grad_norm": 0.8745320439338684, "learning_rate": 0.0002, "epoch": 5.8050056032872615, "step": 7770}, {"loss": 1.3009, "grad_norm": 1.0117204189300537, "learning_rate": 0.0002, "epoch": 5.8124766529697425, "step": 7780}, {"loss": 1.3135, "grad_norm": 1.0387755632400513, "learning_rate": 0.0002, "epoch": 5.819947702652223, "step": 7790}, {"loss": 1.2709, "grad_norm": 1.0709784030914307, "learning_rate": 0.0002, "epoch": 5.827418752334703, "step": 7800}, {"loss": 1.225, "grad_norm": 0.9512667655944824, "learning_rate": 0.0002, "epoch": 5.834889802017184, "step": 7810}, {"loss": 1.3284, "grad_norm": 1.021094560623169, "learning_rate": 0.0002, "epoch": 5.842360851699664, "step": 7820}, {"loss": 1.2794, "grad_norm": 1.117491364479065, "learning_rate": 0.0002, "epoch": 5.849831901382144, "step": 7830}, {"loss": 1.3646, "grad_norm": 0.9252554178237915, "learning_rate": 0.0002, "epoch": 5.857302951064625, "step": 7840}, {"loss": 1.2976, "grad_norm": 1.1416207551956177, "learning_rate": 0.0002, "epoch": 5.864774000747105, "step": 7850}, {"loss": 1.3293, "grad_norm": 1.1219907999038696, "learning_rate": 0.0002, "epoch": 5.872245050429585, "step": 7860}, {"loss": 1.2334, "grad_norm": 0.8300467729568481, "learning_rate": 0.0002, "epoch": 5.8797161001120655, "step": 7870}, {"loss": 1.3132, "grad_norm": 1.00551438331604, "learning_rate": 0.0002, "epoch": 5.8871871497945465, "step": 7880}, {"loss": 1.2609, "grad_norm": 0.8981153964996338, "learning_rate": 0.0002, "epoch": 5.894658199477027, "step": 7890}, {"loss": 1.2817, "grad_norm": 1.0247976779937744, "learning_rate": 0.0002, "epoch": 5.902129249159507, "step": 7900}, {"loss": 1.2866, "grad_norm": 1.0820319652557373, "learning_rate": 0.0002, "epoch": 5.909600298841987, "step": 7910}, {"loss": 1.2941, "grad_norm": 0.952675461769104, "learning_rate": 0.0002, "epoch": 5.917071348524468, "step": 7920}, {"loss": 1.307, "grad_norm": 0.8666740655899048, "learning_rate": 0.0002, "epoch": 5.924542398206948, "step": 7930}, {"loss": 1.2752, "grad_norm": 0.8640421032905579, "learning_rate": 0.0002, "epoch": 5.932013447889428, "step": 7940}, {"loss": 1.2386, "grad_norm": 1.2343276739120483, "learning_rate": 0.0002, "epoch": 5.939484497571909, "step": 7950}, {"loss": 1.2333, "grad_norm": 0.958046555519104, "learning_rate": 0.0002, "epoch": 5.946955547254389, "step": 7960}, {"loss": 1.2352, "grad_norm": 1.0538510084152222, "learning_rate": 0.0002, "epoch": 5.9544265969368695, "step": 7970}, {"loss": 1.3233, "grad_norm": 1.2681571245193481, "learning_rate": 0.0002, "epoch": 5.9618976466193505, "step": 7980}, {"loss": 1.2514, "grad_norm": 0.8171183466911316, "learning_rate": 0.0002, "epoch": 5.969368696301831, "step": 7990}, {"loss": 1.3412, "grad_norm": 0.9109523892402649, "learning_rate": 0.0002, "epoch": 5.976839745984311, "step": 8000}, {"loss": 1.3497, "grad_norm": 1.0040639638900757, "learning_rate": 0.0002, "epoch": 5.984310795666791, "step": 8010}, {"loss": 1.3299, "grad_norm": 0.9596554040908813, "learning_rate": 0.0002, "epoch": 5.991781845349272, "step": 8020}, {"loss": 1.3109, "grad_norm": 0.9782963991165161, "learning_rate": 0.0002, "epoch": 5.999252895031752, "step": 8030}, {"eval_loss": 2.0417845249176025, "eval_runtime": 38.8465, "eval_samples_per_second": 13.257, "eval_steps_per_second": 1.673, "epoch": 6.0, "step": 8031}, {"loss": 1.0886, "grad_norm": 1.380823016166687, "learning_rate": 0.0002, "epoch": 6.006723944714232, "step": 8040}, {"loss": 1.0413, "grad_norm": 1.067636251449585, "learning_rate": 0.0002, "epoch": 6.014194994396712, "step": 8050}, {"loss": 1.0686, "grad_norm": 1.363402009010315, "learning_rate": 0.0002, "epoch": 6.021666044079193, "step": 8060}, {"loss": 1.0762, "grad_norm": 0.9901054501533508, "learning_rate": 0.0002, "epoch": 6.0291370937616735, "step": 8070}, {"loss": 1.1182, "grad_norm": 1.1545379161834717, "learning_rate": 0.0002, "epoch": 6.036608143444154, "step": 8080}, {"loss": 1.0644, "grad_norm": 1.2259265184402466, "learning_rate": 0.0002, "epoch": 6.044079193126635, "step": 8090}, {"loss": 1.1273, "grad_norm": 1.1237425804138184, "learning_rate": 0.0002, "epoch": 6.051550242809115, "step": 8100}, {"loss": 1.1001, "grad_norm": 1.2805622816085815, "learning_rate": 0.0002, "epoch": 6.059021292491595, "step": 8110}, {"loss": 1.0731, "grad_norm": 1.2270452976226807, "learning_rate": 0.0002, "epoch": 6.066492342174075, "step": 8120}, {"loss": 1.0692, "grad_norm": 1.1924101114273071, "learning_rate": 0.0002, "epoch": 6.073963391856556, "step": 8130}, {"loss": 1.1698, "grad_norm": 1.2543894052505493, "learning_rate": 0.0002, "epoch": 6.081434441539036, "step": 8140}, {"loss": 1.069, "grad_norm": 1.1821149587631226, "learning_rate": 0.0002, "epoch": 6.088905491221516, "step": 8150}, {"loss": 1.109, "grad_norm": 1.2202836275100708, "learning_rate": 0.0002, "epoch": 6.096376540903997, "step": 8160}, {"loss": 1.136, "grad_norm": 1.0576019287109375, "learning_rate": 0.0002, "epoch": 6.1038475905864775, "step": 8170}, {"loss": 1.1395, "grad_norm": 1.31708824634552, "learning_rate": 0.0002, "epoch": 6.111318640268958, "step": 8180}, {"loss": 1.0887, "grad_norm": 1.0479495525360107, "learning_rate": 0.0002, "epoch": 6.118789689951438, "step": 8190}, {"loss": 1.0764, "grad_norm": 1.285003423690796, "learning_rate": 0.0002, "epoch": 6.126260739633919, "step": 8200}, {"loss": 1.0642, "grad_norm": 1.0989165306091309, "learning_rate": 0.0002, "epoch": 6.133731789316399, "step": 8210}, {"loss": 1.0981, "grad_norm": 1.1659013032913208, "learning_rate": 0.0002, "epoch": 6.141202838998879, "step": 8220}, {"loss": 1.1138, "grad_norm": 1.2796376943588257, "learning_rate": 0.0002, "epoch": 6.14867388868136, "step": 8230}, {"loss": 1.1116, "grad_norm": 1.060564637184143, "learning_rate": 0.0002, "epoch": 6.15614493836384, "step": 8240}, {"loss": 1.1493, "grad_norm": 1.3884605169296265, "learning_rate": 0.0002, "epoch": 6.16361598804632, "step": 8250}, {"loss": 1.0504, "grad_norm": 1.1570569276809692, "learning_rate": 0.0002, "epoch": 6.1710870377288005, "step": 8260}, {"loss": 1.0386, "grad_norm": 1.4136502742767334, "learning_rate": 0.0002, "epoch": 6.1785580874112815, "step": 8270}, {"loss": 1.0882, "grad_norm": 1.3396095037460327, "learning_rate": 0.0002, "epoch": 6.186029137093762, "step": 8280}, {"loss": 1.133, "grad_norm": 1.2549997568130493, "learning_rate": 0.0002, "epoch": 6.193500186776242, "step": 8290}, {"loss": 1.0626, "grad_norm": 1.3629751205444336, "learning_rate": 0.0002, "epoch": 6.200971236458723, "step": 8300}, {"loss": 1.1343, "grad_norm": 1.1029163599014282, "learning_rate": 0.0002, "epoch": 6.208442286141203, "step": 8310}, {"loss": 1.0895, "grad_norm": 1.1992450952529907, "learning_rate": 0.0002, "epoch": 6.215913335823683, "step": 8320}, {"loss": 1.1417, "grad_norm": 1.3317986726760864, "learning_rate": 0.0002, "epoch": 6.223384385506163, "step": 8330}, {"loss": 1.0958, "grad_norm": 1.0538336038589478, "learning_rate": 0.0002, "epoch": 6.230855435188644, "step": 8340}, {"loss": 1.1557, "grad_norm": 1.1767704486846924, "learning_rate": 0.0002, "epoch": 6.238326484871124, "step": 8350}, {"loss": 1.1038, "grad_norm": 1.1213016510009766, "learning_rate": 0.0002, "epoch": 6.2457975345536045, "step": 8360}, {"loss": 1.1241, "grad_norm": 1.1895716190338135, "learning_rate": 0.0002, "epoch": 6.253268584236086, "step": 8370}, {"loss": 1.1171, "grad_norm": 1.1078153848648071, "learning_rate": 0.0002, "epoch": 6.260739633918566, "step": 8380}, {"loss": 1.1124, "grad_norm": 1.1662801504135132, "learning_rate": 0.0002, "epoch": 6.268210683601046, "step": 8390}, {"loss": 1.125, "grad_norm": 1.2071197032928467, "learning_rate": 0.0002, "epoch": 6.275681733283526, "step": 8400}, {"loss": 1.0625, "grad_norm": 1.2653778791427612, "learning_rate": 0.0002, "epoch": 6.283152782966007, "step": 8410}, {"loss": 1.0565, "grad_norm": 1.6128872632980347, "learning_rate": 0.0002, "epoch": 6.290623832648487, "step": 8420}, {"loss": 1.1212, "grad_norm": 1.4993070363998413, "learning_rate": 0.0002, "epoch": 6.298094882330967, "step": 8430}, {"loss": 1.1516, "grad_norm": 1.16339910030365, "learning_rate": 0.0002, "epoch": 6.305565932013448, "step": 8440}, {"loss": 1.0662, "grad_norm": 1.256822943687439, "learning_rate": 0.0002, "epoch": 6.313036981695928, "step": 8450}, {"loss": 1.1566, "grad_norm": 1.1352964639663696, "learning_rate": 0.0002, "epoch": 6.3205080313784086, "step": 8460}, {"loss": 1.1297, "grad_norm": 1.0061070919036865, "learning_rate": 0.0002, "epoch": 6.327979081060889, "step": 8470}, {"loss": 1.0967, "grad_norm": 1.1901768445968628, "learning_rate": 0.0002, "epoch": 6.33545013074337, "step": 8480}, {"loss": 1.1463, "grad_norm": 1.2715139389038086, "learning_rate": 0.0002, "epoch": 6.34292118042585, "step": 8490}, {"loss": 1.2143, "grad_norm": 1.1583346128463745, "learning_rate": 0.0002, "epoch": 6.35039223010833, "step": 8500}, {"loss": 1.1072, "grad_norm": 1.1427477598190308, "learning_rate": 0.0002, "epoch": 6.357863279790811, "step": 8510}, {"loss": 1.1119, "grad_norm": 1.1952263116836548, "learning_rate": 0.0002, "epoch": 6.365334329473291, "step": 8520}, {"loss": 1.0797, "grad_norm": 1.0599623918533325, "learning_rate": 0.0002, "epoch": 6.372805379155771, "step": 8530}, {"loss": 1.1091, "grad_norm": 1.3511574268341064, "learning_rate": 0.0002, "epoch": 6.380276428838251, "step": 8540}, {"loss": 1.1272, "grad_norm": 1.171126127243042, "learning_rate": 0.0002, "epoch": 6.387747478520732, "step": 8550}, {"loss": 1.1615, "grad_norm": 1.285474419593811, "learning_rate": 0.0002, "epoch": 6.395218528203213, "step": 8560}, {"loss": 1.1505, "grad_norm": 0.9751279950141907, "learning_rate": 0.0002, "epoch": 6.402689577885693, "step": 8570}, {"loss": 1.1502, "grad_norm": 1.2194149494171143, "learning_rate": 0.0002, "epoch": 6.410160627568174, "step": 8580}, {"loss": 1.138, "grad_norm": 1.255888819694519, "learning_rate": 0.0002, "epoch": 6.417631677250654, "step": 8590}, {"loss": 1.1308, "grad_norm": 1.1636122465133667, "learning_rate": 0.0002, "epoch": 6.425102726933134, "step": 8600}, {"loss": 1.1398, "grad_norm": 1.0769859552383423, "learning_rate": 0.0002, "epoch": 6.432573776615614, "step": 8610}, {"loss": 1.1183, "grad_norm": 1.151778221130371, "learning_rate": 0.0002, "epoch": 6.440044826298095, "step": 8620}, {"loss": 1.0706, "grad_norm": 1.2749944925308228, "learning_rate": 0.0002, "epoch": 6.447515875980575, "step": 8630}, {"loss": 1.1011, "grad_norm": 1.1925828456878662, "learning_rate": 0.0002, "epoch": 6.454986925663055, "step": 8640}, {"loss": 1.1581, "grad_norm": 1.166107416152954, "learning_rate": 0.0002, "epoch": 6.4624579753455365, "step": 8650}, {"loss": 1.105, "grad_norm": 1.0372248888015747, "learning_rate": 0.0002, "epoch": 6.469929025028017, "step": 8660}, {"loss": 1.1546, "grad_norm": 1.26933753490448, "learning_rate": 0.0002, "epoch": 6.477400074710497, "step": 8670}, {"loss": 1.2362, "grad_norm": 1.2154223918914795, "learning_rate": 0.0002, "epoch": 6.484871124392977, "step": 8680}, {"loss": 1.1096, "grad_norm": 1.09475839138031, "learning_rate": 0.0002, "epoch": 6.492342174075458, "step": 8690}, {"loss": 1.1168, "grad_norm": 1.0763037204742432, "learning_rate": 0.0002, "epoch": 6.499813223757938, "step": 8700}, {"loss": 1.1993, "grad_norm": 1.1882896423339844, "learning_rate": 0.0002, "epoch": 6.507284273440418, "step": 8710}, {"loss": 1.1498, "grad_norm": 1.1662089824676514, "learning_rate": 0.0002, "epoch": 6.514755323122898, "step": 8720}, {"loss": 1.2008, "grad_norm": 1.3259495496749878, "learning_rate": 0.0002, "epoch": 6.522226372805379, "step": 8730}, {"loss": 1.1289, "grad_norm": 1.0858017206192017, "learning_rate": 0.0002, "epoch": 6.5296974224878594, "step": 8740}, {"loss": 1.1335, "grad_norm": 1.240337610244751, "learning_rate": 0.0002, "epoch": 6.53716847217034, "step": 8750}, {"loss": 1.1479, "grad_norm": 1.1381462812423706, "learning_rate": 0.0002, "epoch": 6.544639521852821, "step": 8760}, {"loss": 1.0991, "grad_norm": 1.2220063209533691, "learning_rate": 0.0002, "epoch": 6.552110571535301, "step": 8770}, {"loss": 1.159, "grad_norm": 1.1553083658218384, "learning_rate": 0.0002, "epoch": 6.559581621217781, "step": 8780}, {"loss": 1.0996, "grad_norm": 1.1383219957351685, "learning_rate": 0.0002, "epoch": 6.567052670900262, "step": 8790}, {"loss": 1.1355, "grad_norm": 1.0379676818847656, "learning_rate": 0.0002, "epoch": 6.574523720582742, "step": 8800}, {"loss": 1.1704, "grad_norm": 1.376488447189331, "learning_rate": 0.0002, "epoch": 6.581994770265222, "step": 8810}, {"loss": 1.1265, "grad_norm": 1.1586211919784546, "learning_rate": 0.0002, "epoch": 6.589465819947702, "step": 8820}, {"loss": 1.1904, "grad_norm": 1.28152334690094, "learning_rate": 0.0002, "epoch": 6.596936869630183, "step": 8830}, {"loss": 1.1646, "grad_norm": 1.2656810283660889, "learning_rate": 0.0002, "epoch": 6.6044079193126635, "step": 8840}, {"loss": 1.1865, "grad_norm": 1.0636502504348755, "learning_rate": 0.0002, "epoch": 6.611878968995144, "step": 8850}, {"loss": 1.125, "grad_norm": 1.273239254951477, "learning_rate": 0.0002, "epoch": 6.619350018677624, "step": 8860}, {"loss": 1.1443, "grad_norm": 1.1055482625961304, "learning_rate": 0.0002, "epoch": 6.626821068360105, "step": 8870}, {"loss": 1.0877, "grad_norm": 1.1934176683425903, "learning_rate": 0.0002, "epoch": 6.634292118042585, "step": 8880}, {"loss": 1.194, "grad_norm": 1.2248114347457886, "learning_rate": 0.0002, "epoch": 6.641763167725065, "step": 8890}, {"loss": 1.1609, "grad_norm": 1.1950982809066772, "learning_rate": 0.0002, "epoch": 6.649234217407546, "step": 8900}, {"loss": 1.169, "grad_norm": 1.0821784734725952, "learning_rate": 0.0002, "epoch": 6.656705267090026, "step": 8910}, {"loss": 1.1337, "grad_norm": 1.0062463283538818, "learning_rate": 0.0002, "epoch": 6.664176316772506, "step": 8920}, {"loss": 1.1403, "grad_norm": 1.2373089790344238, "learning_rate": 0.0002, "epoch": 6.671647366454987, "step": 8930}, {"loss": 1.2051, "grad_norm": 1.1821746826171875, "learning_rate": 0.0002, "epoch": 6.6791184161374675, "step": 8940}, {"loss": 1.1214, "grad_norm": 1.2350659370422363, "learning_rate": 0.0002, "epoch": 6.686589465819948, "step": 8950}, {"loss": 1.225, "grad_norm": 1.1012883186340332, "learning_rate": 0.0002, "epoch": 6.694060515502428, "step": 8960}, {"loss": 1.2111, "grad_norm": 1.2008943557739258, "learning_rate": 0.0002, "epoch": 6.701531565184909, "step": 8970}, {"loss": 1.1769, "grad_norm": 1.2355504035949707, "learning_rate": 0.0002, "epoch": 6.709002614867389, "step": 8980}, {"loss": 1.1323, "grad_norm": 1.2367502450942993, "learning_rate": 0.0002, "epoch": 6.716473664549869, "step": 8990}, {"loss": 1.1235, "grad_norm": 1.1075866222381592, "learning_rate": 0.0002, "epoch": 6.723944714232349, "step": 9000}, {"loss": 1.1239, "grad_norm": 1.246480941772461, "learning_rate": 0.0002, "epoch": 6.73141576391483, "step": 9010}, {"loss": 1.2154, "grad_norm": 1.1252824068069458, "learning_rate": 0.0002, "epoch": 6.73888681359731, "step": 9020}, {"loss": 1.1762, "grad_norm": 1.0706887245178223, "learning_rate": 0.0002, "epoch": 6.7463578632797905, "step": 9030}, {"loss": 1.1961, "grad_norm": 1.0874755382537842, "learning_rate": 0.0002, "epoch": 6.7538289129622715, "step": 9040}, {"loss": 1.0889, "grad_norm": 1.121434211730957, "learning_rate": 0.0002, "epoch": 6.761299962644752, "step": 9050}, {"loss": 1.2018, "grad_norm": 1.1517996788024902, "learning_rate": 0.0002, "epoch": 6.768771012327232, "step": 9060}, {"loss": 1.1593, "grad_norm": 1.2484540939331055, "learning_rate": 0.0002, "epoch": 6.776242062009713, "step": 9070}, {"loss": 1.13, "grad_norm": 1.023059368133545, "learning_rate": 0.0002, "epoch": 6.783713111692193, "step": 9080}, {"loss": 1.1929, "grad_norm": 1.1334631443023682, "learning_rate": 0.0002, "epoch": 6.791184161374673, "step": 9090}, {"loss": 1.18, "grad_norm": 1.2991816997528076, "learning_rate": 0.0002, "epoch": 6.798655211057153, "step": 9100}, {"loss": 1.2398, "grad_norm": 1.4147199392318726, "learning_rate": 0.0002, "epoch": 6.806126260739634, "step": 9110}, {"loss": 1.0958, "grad_norm": 1.1353832483291626, "learning_rate": 0.0002, "epoch": 6.813597310422114, "step": 9120}, {"loss": 1.1379, "grad_norm": 1.0332539081573486, "learning_rate": 0.0002, "epoch": 6.8210683601045945, "step": 9130}, {"loss": 1.1652, "grad_norm": 1.2208142280578613, "learning_rate": 0.0002, "epoch": 6.828539409787075, "step": 9140}, {"loss": 1.1463, "grad_norm": 1.3033398389816284, "learning_rate": 0.0002, "epoch": 6.836010459469556, "step": 9150}, {"loss": 1.1834, "grad_norm": 1.2676737308502197, "learning_rate": 0.0002, "epoch": 6.843481509152036, "step": 9160}, {"loss": 1.1786, "grad_norm": 1.1668603420257568, "learning_rate": 0.0002, "epoch": 6.850952558834516, "step": 9170}, {"loss": 1.1801, "grad_norm": 1.1994788646697998, "learning_rate": 0.0002, "epoch": 6.858423608516997, "step": 9180}, {"loss": 1.2131, "grad_norm": 1.231873869895935, "learning_rate": 0.0002, "epoch": 6.865894658199477, "step": 9190}, {"loss": 1.2109, "grad_norm": 0.9981484413146973, "learning_rate": 0.0002, "epoch": 6.873365707881957, "step": 9200}, {"loss": 1.1084, "grad_norm": 1.2799428701400757, "learning_rate": 0.0002, "epoch": 6.880836757564438, "step": 9210}, {"loss": 1.2004, "grad_norm": 1.2042057514190674, "learning_rate": 0.0002, "epoch": 6.888307807246918, "step": 9220}, {"loss": 1.1567, "grad_norm": 1.070420265197754, "learning_rate": 0.0002, "epoch": 6.8957788569293985, "step": 9230}, {"loss": 1.1353, "grad_norm": 1.327160358428955, "learning_rate": 0.0002, "epoch": 6.903249906611879, "step": 9240}, {"loss": 1.1945, "grad_norm": 1.1109007596969604, "learning_rate": 0.0002, "epoch": 6.91072095629436, "step": 9250}, {"loss": 1.1701, "grad_norm": 1.1669930219650269, "learning_rate": 0.0002, "epoch": 6.91819200597684, "step": 9260}, {"loss": 1.1854, "grad_norm": 1.034532904624939, "learning_rate": 0.0002, "epoch": 6.92566305565932, "step": 9270}, {"loss": 1.1712, "grad_norm": 1.1035540103912354, "learning_rate": 0.0002, "epoch": 6.9331341053418, "step": 9280}, {"loss": 1.1767, "grad_norm": 1.366254448890686, "learning_rate": 0.0002, "epoch": 6.940605155024281, "step": 9290}, {"loss": 1.1591, "grad_norm": 1.094214677810669, "learning_rate": 0.0002, "epoch": 6.948076204706761, "step": 9300}, {"loss": 1.18, "grad_norm": 1.131238579750061, "learning_rate": 0.0002, "epoch": 6.955547254389241, "step": 9310}, {"loss": 1.2513, "grad_norm": 1.202369213104248, "learning_rate": 0.0002, "epoch": 6.963018304071722, "step": 9320}, {"loss": 1.1922, "grad_norm": 1.1067225933074951, "learning_rate": 0.0002, "epoch": 6.9704893537542025, "step": 9330}, {"loss": 1.1965, "grad_norm": 1.0258643627166748, "learning_rate": 0.0002, "epoch": 6.977960403436683, "step": 9340}, {"loss": 1.2053, "grad_norm": 1.3311655521392822, "learning_rate": 0.0002, "epoch": 6.985431453119164, "step": 9350}, {"loss": 1.1778, "grad_norm": 1.1245559453964233, "learning_rate": 0.0002, "epoch": 6.992902502801644, "step": 9360}]} +{"epoch": 7.997011580127008, "step": 10704, "epoch_duration": 1871.7742564678192, "total_accumulated_duration": 13167.358186483383, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-6326-sd-42/checkpoint-2677", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6181, "grad_norm": 0.4912872612476349, "learning_rate": 0.0002, "epoch": 0.007471049682480389, "step": 10}, {"loss": 2.2606, "grad_norm": 0.4856316149234772, "learning_rate": 0.0002, "epoch": 0.014942099364960777, "step": 20}, {"loss": 2.0957, "grad_norm": 0.47683125734329224, "learning_rate": 0.0002, "epoch": 0.022413149047441166, "step": 30}, {"loss": 1.8908, "grad_norm": 0.515082597732544, "learning_rate": 0.0002, "epoch": 0.029884198729921554, "step": 40}, {"loss": 1.9704, "grad_norm": 0.5299215316772461, "learning_rate": 0.0002, "epoch": 0.03735524841240194, "step": 50}, {"loss": 1.9225, "grad_norm": 0.4951399862766266, "learning_rate": 0.0002, "epoch": 0.04482629809488233, "step": 60}, {"loss": 1.9742, "grad_norm": 0.48079821467399597, "learning_rate": 0.0002, "epoch": 0.05229734777736272, "step": 70}, {"loss": 1.9466, "grad_norm": 0.49402132630348206, "learning_rate": 0.0002, "epoch": 0.05976839745984311, "step": 80}, {"loss": 1.8691, "grad_norm": 0.4778193235397339, "learning_rate": 0.0002, "epoch": 0.0672394471423235, "step": 90}, {"loss": 1.8455, "grad_norm": 0.42472657561302185, "learning_rate": 0.0002, "epoch": 0.07471049682480388, "step": 100}, {"loss": 1.8744, "grad_norm": 0.4433092474937439, "learning_rate": 0.0002, "epoch": 0.08218154650728428, "step": 110}, {"loss": 1.865, "grad_norm": 0.4472862780094147, "learning_rate": 0.0002, "epoch": 0.08965259618976466, "step": 120}, {"loss": 1.9256, "grad_norm": 0.42596298456192017, "learning_rate": 0.0002, "epoch": 0.09712364587224505, "step": 130}, {"loss": 1.8015, "grad_norm": 0.46645811200141907, "learning_rate": 0.0002, "epoch": 0.10459469555472543, "step": 140}, {"loss": 1.8307, "grad_norm": 0.41041234135627747, "learning_rate": 0.0002, "epoch": 0.11206574523720583, "step": 150}, {"loss": 1.8276, "grad_norm": 0.5329819917678833, "learning_rate": 0.0002, "epoch": 0.11953679491968622, "step": 160}, {"loss": 1.8118, "grad_norm": 0.4065922200679779, "learning_rate": 0.0002, "epoch": 0.1270078446021666, "step": 170}, {"loss": 1.8559, "grad_norm": 0.38406994938850403, "learning_rate": 0.0002, "epoch": 0.134478894284647, "step": 180}, {"loss": 1.8647, "grad_norm": 0.4246881306171417, "learning_rate": 0.0002, "epoch": 0.14194994396712737, "step": 190}, {"loss": 1.8054, "grad_norm": 0.35136649012565613, "learning_rate": 0.0002, "epoch": 0.14942099364960776, "step": 200}, {"loss": 1.802, "grad_norm": 0.43252742290496826, "learning_rate": 0.0002, "epoch": 0.15689204333208817, "step": 210}, {"loss": 1.7823, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002, "epoch": 0.16436309301456856, "step": 220}, {"loss": 1.818, "grad_norm": 0.3748249113559723, "learning_rate": 0.0002, "epoch": 0.17183414269704894, "step": 230}, {"loss": 1.866, "grad_norm": 0.6432855725288391, "learning_rate": 0.0002, "epoch": 0.17930519237952933, "step": 240}, {"loss": 1.8397, "grad_norm": 0.34874802827835083, "learning_rate": 0.0002, "epoch": 0.1867762420620097, "step": 250}, {"loss": 1.79, "grad_norm": 0.3721984326839447, "learning_rate": 0.0002, "epoch": 0.1942472917444901, "step": 260}, {"loss": 1.8464, "grad_norm": 0.4339311420917511, "learning_rate": 0.0002, "epoch": 0.20171834142697048, "step": 270}, {"loss": 1.8665, "grad_norm": 0.4018215537071228, "learning_rate": 0.0002, "epoch": 0.20918939110945087, "step": 280}, {"loss": 1.8048, "grad_norm": 0.3278839886188507, "learning_rate": 0.0002, "epoch": 0.21666044079193125, "step": 290}, {"loss": 1.7395, "grad_norm": 0.36146077513694763, "learning_rate": 0.0002, "epoch": 0.22413149047441167, "step": 300}, {"loss": 1.7916, "grad_norm": 0.38175010681152344, "learning_rate": 0.0002, "epoch": 0.23160254015689205, "step": 310}, {"loss": 1.8593, "grad_norm": 0.44776618480682373, "learning_rate": 0.0002, "epoch": 0.23907358983937244, "step": 320}, {"loss": 1.7824, "grad_norm": 0.3933652937412262, "learning_rate": 0.0002, "epoch": 0.24654463952185282, "step": 330}, {"loss": 1.8393, "grad_norm": 0.3515005111694336, "learning_rate": 0.0002, "epoch": 0.2540156892043332, "step": 340}, {"loss": 1.8653, "grad_norm": 0.6683304309844971, "learning_rate": 0.0002, "epoch": 0.2614867388868136, "step": 350}, {"loss": 1.8797, "grad_norm": 0.37093454599380493, "learning_rate": 0.0002, "epoch": 0.268957788569294, "step": 360}, {"loss": 1.8251, "grad_norm": 0.3450651168823242, "learning_rate": 0.0002, "epoch": 0.2764288382517744, "step": 370}, {"loss": 1.7435, "grad_norm": 0.5140917301177979, "learning_rate": 0.0002, "epoch": 0.28389988793425475, "step": 380}, {"loss": 1.8026, "grad_norm": 0.32885563373565674, "learning_rate": 0.0002, "epoch": 0.29137093761673516, "step": 390}, {"loss": 1.8174, "grad_norm": 0.33962297439575195, "learning_rate": 0.0002, "epoch": 0.2988419872992155, "step": 400}, {"loss": 1.7467, "grad_norm": 0.3723141849040985, "learning_rate": 0.0002, "epoch": 0.30631303698169593, "step": 410}, {"loss": 1.8459, "grad_norm": 0.37173134088516235, "learning_rate": 0.0002, "epoch": 0.31378408666417634, "step": 420}, {"loss": 1.8876, "grad_norm": 0.33736956119537354, "learning_rate": 0.0002, "epoch": 0.3212551363466567, "step": 430}, {"loss": 1.8367, "grad_norm": 0.3602448105812073, "learning_rate": 0.0002, "epoch": 0.3287261860291371, "step": 440}, {"loss": 1.8058, "grad_norm": 0.3569699227809906, "learning_rate": 0.0002, "epoch": 0.33619723571161747, "step": 450}, {"loss": 1.8086, "grad_norm": 0.31009167432785034, "learning_rate": 0.0002, "epoch": 0.3436682853940979, "step": 460}, {"loss": 1.8876, "grad_norm": 0.5278693437576294, "learning_rate": 0.0002, "epoch": 0.35113933507657824, "step": 470}, {"loss": 1.8534, "grad_norm": 0.3587537109851837, "learning_rate": 0.0002, "epoch": 0.35861038475905865, "step": 480}, {"loss": 1.8046, "grad_norm": 0.3859670162200928, "learning_rate": 0.0002, "epoch": 0.366081434441539, "step": 490}, {"loss": 1.8287, "grad_norm": 0.395913690328598, "learning_rate": 0.0002, "epoch": 0.3735524841240194, "step": 500}, {"loss": 1.7619, "grad_norm": 0.35052940249443054, "learning_rate": 0.0002, "epoch": 0.38102353380649984, "step": 510}, {"loss": 1.7824, "grad_norm": 0.2979494333267212, "learning_rate": 0.0002, "epoch": 0.3884945834889802, "step": 520}, {"loss": 1.8641, "grad_norm": 0.3062683343887329, "learning_rate": 0.0002, "epoch": 0.3959656331714606, "step": 530}, {"loss": 1.7651, "grad_norm": 0.3172847330570221, "learning_rate": 0.0002, "epoch": 0.40343668285394096, "step": 540}, {"loss": 1.806, "grad_norm": 0.360435426235199, "learning_rate": 0.0002, "epoch": 0.4109077325364214, "step": 550}, {"loss": 1.9054, "grad_norm": 0.3427872359752655, "learning_rate": 0.0002, "epoch": 0.41837878221890173, "step": 560}, {"loss": 1.7562, "grad_norm": 0.34036558866500854, "learning_rate": 0.0002, "epoch": 0.42584983190138215, "step": 570}, {"loss": 1.7254, "grad_norm": 0.3365345299243927, "learning_rate": 0.0002, "epoch": 0.4333208815838625, "step": 580}, {"loss": 1.8328, "grad_norm": 0.35619041323661804, "learning_rate": 0.0002, "epoch": 0.4407919312663429, "step": 590}, {"loss": 1.8114, "grad_norm": 0.3569088280200958, "learning_rate": 0.0002, "epoch": 0.44826298094882333, "step": 600}, {"loss": 1.8599, "grad_norm": 0.3581278622150421, "learning_rate": 0.0002, "epoch": 0.4557340306313037, "step": 610}, {"loss": 1.7078, "grad_norm": 0.43197110295295715, "learning_rate": 0.0002, "epoch": 0.4632050803137841, "step": 620}, {"loss": 1.8257, "grad_norm": 0.33966198563575745, "learning_rate": 0.0002, "epoch": 0.47067612999626446, "step": 630}, {"loss": 1.7528, "grad_norm": 0.3343866467475891, "learning_rate": 0.0002, "epoch": 0.47814717967874487, "step": 640}, {"loss": 1.8191, "grad_norm": 0.33878564834594727, "learning_rate": 0.0002, "epoch": 0.48561822936122523, "step": 650}, {"loss": 1.8801, "grad_norm": 0.387195885181427, "learning_rate": 0.0002, "epoch": 0.49308927904370564, "step": 660}, {"loss": 1.7559, "grad_norm": 0.3755440413951874, "learning_rate": 0.0002, "epoch": 0.500560328726186, "step": 670}, {"loss": 1.8057, "grad_norm": 0.3272816836833954, "learning_rate": 0.0002, "epoch": 0.5080313784086664, "step": 680}, {"loss": 1.8156, "grad_norm": 0.36063864827156067, "learning_rate": 0.0002, "epoch": 0.5155024280911468, "step": 690}, {"loss": 1.8397, "grad_norm": 0.35317373275756836, "learning_rate": 0.0002, "epoch": 0.5229734777736272, "step": 700}, {"loss": 1.7603, "grad_norm": 0.3561195433139801, "learning_rate": 0.0002, "epoch": 0.5304445274561076, "step": 710}, {"loss": 1.8149, "grad_norm": 0.31124624609947205, "learning_rate": 0.0002, "epoch": 0.537915577138588, "step": 720}, {"loss": 1.7434, "grad_norm": 0.3294544517993927, "learning_rate": 0.0002, "epoch": 0.5453866268210683, "step": 730}, {"loss": 1.8027, "grad_norm": 0.31933900713920593, "learning_rate": 0.0002, "epoch": 0.5528576765035488, "step": 740}, {"loss": 1.7601, "grad_norm": 0.3226020634174347, "learning_rate": 0.0002, "epoch": 0.5603287261860291, "step": 750}, {"loss": 1.7862, "grad_norm": 0.3147525489330292, "learning_rate": 0.0002, "epoch": 0.5677997758685095, "step": 760}, {"loss": 1.9028, "grad_norm": 0.32234328985214233, "learning_rate": 0.0002, "epoch": 0.57527082555099, "step": 770}, {"loss": 1.7623, "grad_norm": 0.3258664309978485, "learning_rate": 0.0002, "epoch": 0.5827418752334703, "step": 780}, {"loss": 1.7384, "grad_norm": 0.3166961967945099, "learning_rate": 0.0002, "epoch": 0.5902129249159507, "step": 790}, {"loss": 1.8799, "grad_norm": 0.35621458292007446, "learning_rate": 0.0002, "epoch": 0.597683974598431, "step": 800}, {"loss": 1.8313, "grad_norm": 0.3236999213695526, "learning_rate": 0.0002, "epoch": 0.6051550242809115, "step": 810}, {"loss": 1.7132, "grad_norm": 0.2892923653125763, "learning_rate": 0.0002, "epoch": 0.6126260739633919, "step": 820}, {"loss": 1.8709, "grad_norm": 0.4098321497440338, "learning_rate": 0.0002, "epoch": 0.6200971236458722, "step": 830}, {"loss": 1.7637, "grad_norm": 0.3337118923664093, "learning_rate": 0.0002, "epoch": 0.6275681733283527, "step": 840}, {"loss": 1.7375, "grad_norm": 0.30416029691696167, "learning_rate": 0.0002, "epoch": 0.635039223010833, "step": 850}, {"loss": 1.7419, "grad_norm": 0.3361026346683502, "learning_rate": 0.0002, "epoch": 0.6425102726933134, "step": 860}, {"loss": 1.732, "grad_norm": 0.3537365198135376, "learning_rate": 0.0002, "epoch": 0.6499813223757938, "step": 870}, {"loss": 1.7825, "grad_norm": 0.33854469656944275, "learning_rate": 0.0002, "epoch": 0.6574523720582742, "step": 880}, {"loss": 1.7561, "grad_norm": 0.3332272469997406, "learning_rate": 0.0002, "epoch": 0.6649234217407546, "step": 890}, {"loss": 1.7247, "grad_norm": 0.34954726696014404, "learning_rate": 0.0002, "epoch": 0.6723944714232349, "step": 900}, {"loss": 1.7917, "grad_norm": 0.2921750247478485, "learning_rate": 0.0002, "epoch": 0.6798655211057153, "step": 910}, {"loss": 1.7807, "grad_norm": 0.30508682131767273, "learning_rate": 0.0002, "epoch": 0.6873365707881958, "step": 920}, {"loss": 1.8082, "grad_norm": 0.32268425822257996, "learning_rate": 0.0002, "epoch": 0.6948076204706761, "step": 930}, {"loss": 1.8283, "grad_norm": 0.2844390869140625, "learning_rate": 0.0002, "epoch": 0.7022786701531565, "step": 940}, {"loss": 1.7363, "grad_norm": 0.31263890862464905, "learning_rate": 0.0002, "epoch": 0.709749719835637, "step": 950}, {"loss": 1.8081, "grad_norm": 0.3626808822154999, "learning_rate": 0.0002, "epoch": 0.7172207695181173, "step": 960}, {"loss": 1.853, "grad_norm": 0.3322749733924866, "learning_rate": 0.0002, "epoch": 0.7246918192005977, "step": 970}, {"loss": 1.7912, "grad_norm": 0.29177871346473694, "learning_rate": 0.0002, "epoch": 0.732162868883078, "step": 980}, {"loss": 1.8447, "grad_norm": 0.35405513644218445, "learning_rate": 0.0002, "epoch": 0.7396339185655585, "step": 990}, {"loss": 1.7008, "grad_norm": 0.39318400621414185, "learning_rate": 0.0002, "epoch": 0.7471049682480388, "step": 1000}, {"loss": 1.7803, "grad_norm": 0.29401418566703796, "learning_rate": 0.0002, "epoch": 0.7545760179305192, "step": 1010}, {"loss": 1.7649, "grad_norm": 0.3271748721599579, "learning_rate": 0.0002, "epoch": 0.7620470676129997, "step": 1020}, {"loss": 1.7266, "grad_norm": 0.30883970856666565, "learning_rate": 0.0002, "epoch": 0.76951811729548, "step": 1030}, {"loss": 1.7722, "grad_norm": 0.3411838412284851, "learning_rate": 0.0002, "epoch": 0.7769891669779604, "step": 1040}, {"loss": 1.829, "grad_norm": 0.30608129501342773, "learning_rate": 0.0002, "epoch": 0.7844602166604407, "step": 1050}, {"loss": 1.7815, "grad_norm": 0.30899080634117126, "learning_rate": 0.0002, "epoch": 0.7919312663429212, "step": 1060}, {"loss": 1.7625, "grad_norm": 0.3160453140735626, "learning_rate": 0.0002, "epoch": 0.7994023160254016, "step": 1070}, {"loss": 1.8452, "grad_norm": 0.30947187542915344, "learning_rate": 0.0002, "epoch": 0.8068733657078819, "step": 1080}, {"loss": 1.7418, "grad_norm": 0.3103134036064148, "learning_rate": 0.0002, "epoch": 0.8143444153903624, "step": 1090}, {"loss": 1.842, "grad_norm": 0.31771138310432434, "learning_rate": 0.0002, "epoch": 0.8218154650728428, "step": 1100}, {"loss": 1.7918, "grad_norm": 0.5860997438430786, "learning_rate": 0.0002, "epoch": 0.8292865147553231, "step": 1110}, {"loss": 1.8443, "grad_norm": 0.3230148255825043, "learning_rate": 0.0002, "epoch": 0.8367575644378035, "step": 1120}, {"loss": 1.8478, "grad_norm": 0.29611510038375854, "learning_rate": 0.0002, "epoch": 0.8442286141202839, "step": 1130}, {"loss": 1.7673, "grad_norm": 0.3373654782772064, "learning_rate": 0.0002, "epoch": 0.8516996638027643, "step": 1140}, {"loss": 1.7997, "grad_norm": 0.3474279046058655, "learning_rate": 0.0002, "epoch": 0.8591707134852447, "step": 1150}, {"loss": 1.75, "grad_norm": 0.35057875514030457, "learning_rate": 0.0002, "epoch": 0.866641763167725, "step": 1160}, {"loss": 1.8273, "grad_norm": 0.39537495374679565, "learning_rate": 0.0002, "epoch": 0.8741128128502055, "step": 1170}, {"loss": 1.7682, "grad_norm": 0.3714233636856079, "learning_rate": 0.0002, "epoch": 0.8815838625326858, "step": 1180}, {"loss": 1.7549, "grad_norm": 0.2950296998023987, "learning_rate": 0.0002, "epoch": 0.8890549122151662, "step": 1190}, {"loss": 1.7612, "grad_norm": 0.38182979822158813, "learning_rate": 0.0002, "epoch": 0.8965259618976467, "step": 1200}, {"loss": 1.827, "grad_norm": 0.27883678674697876, "learning_rate": 0.0002, "epoch": 0.903997011580127, "step": 1210}, {"loss": 1.7623, "grad_norm": 0.33874374628067017, "learning_rate": 0.0002, "epoch": 0.9114680612626074, "step": 1220}, {"loss": 1.7334, "grad_norm": 0.3014272153377533, "learning_rate": 0.0002, "epoch": 0.9189391109450877, "step": 1230}, {"loss": 1.8235, "grad_norm": 0.3194271922111511, "learning_rate": 0.0002, "epoch": 0.9264101606275682, "step": 1240}, {"loss": 1.7924, "grad_norm": 0.3049403429031372, "learning_rate": 0.0002, "epoch": 0.9338812103100486, "step": 1250}, {"loss": 1.7535, "grad_norm": 0.30621254444122314, "learning_rate": 0.0002, "epoch": 0.9413522599925289, "step": 1260}, {"loss": 1.8287, "grad_norm": 0.28675132989883423, "learning_rate": 0.0002, "epoch": 0.9488233096750094, "step": 1270}, {"loss": 1.7586, "grad_norm": 0.3322032690048218, "learning_rate": 0.0002, "epoch": 0.9562943593574897, "step": 1280}, {"loss": 1.8054, "grad_norm": 0.35408294200897217, "learning_rate": 0.0002, "epoch": 0.9637654090399701, "step": 1290}, {"loss": 1.7343, "grad_norm": 0.36386919021606445, "learning_rate": 0.0002, "epoch": 0.9712364587224505, "step": 1300}, {"loss": 1.8633, "grad_norm": 0.32338324189186096, "learning_rate": 0.0002, "epoch": 0.9787075084049309, "step": 1310}, {"loss": 1.7724, "grad_norm": 0.3714013993740082, "learning_rate": 0.0002, "epoch": 0.9861785580874113, "step": 1320}, {"loss": 1.7766, "grad_norm": 0.3133082389831543, "learning_rate": 0.0002, "epoch": 0.9936496077698916, "step": 1330}, {"eval_loss": 1.8051470518112183, "eval_runtime": 38.6332, "eval_samples_per_second": 13.331, "eval_steps_per_second": 1.682, "epoch": 0.9996264475158759, "step": 1338}, {"loss": 1.8035, "grad_norm": 0.31595754623413086, "learning_rate": 0.0002, "epoch": 1.001120657452372, "step": 1340}, {"loss": 1.7486, "grad_norm": 0.3095700144767761, "learning_rate": 0.0002, "epoch": 1.0085917071348525, "step": 1350}, {"loss": 1.6981, "grad_norm": 0.34677496552467346, "learning_rate": 0.0002, "epoch": 1.0160627568173328, "step": 1360}, {"loss": 1.7377, "grad_norm": 0.29108840227127075, "learning_rate": 0.0002, "epoch": 1.0235338064998132, "step": 1370}, {"loss": 1.7194, "grad_norm": 0.32356950640678406, "learning_rate": 0.0002, "epoch": 1.0310048561822935, "step": 1380}, {"loss": 1.7593, "grad_norm": 0.4200669229030609, "learning_rate": 0.0002, "epoch": 1.038475905864774, "step": 1390}, {"loss": 1.797, "grad_norm": 0.3283711373806, "learning_rate": 0.0002, "epoch": 1.0459469555472545, "step": 1400}, {"loss": 1.7163, "grad_norm": 0.32898256182670593, "learning_rate": 0.0002, "epoch": 1.0534180052297348, "step": 1410}, {"loss": 1.7559, "grad_norm": 0.38790300488471985, "learning_rate": 0.0002, "epoch": 1.0608890549122152, "step": 1420}, {"loss": 1.6922, "grad_norm": 0.339800089597702, "learning_rate": 0.0002, "epoch": 1.0683601045946955, "step": 1430}, {"loss": 1.7076, "grad_norm": 0.3548751175403595, "learning_rate": 0.0002, "epoch": 1.075831154277176, "step": 1440}, {"loss": 1.6985, "grad_norm": 0.35114359855651855, "learning_rate": 0.0002, "epoch": 1.0833022039596563, "step": 1450}, {"loss": 1.7217, "grad_norm": 0.35226720571517944, "learning_rate": 0.0002, "epoch": 1.0907732536421366, "step": 1460}, {"loss": 1.6822, "grad_norm": 0.33665576577186584, "learning_rate": 0.0002, "epoch": 1.0982443033246172, "step": 1470}, {"loss": 1.6699, "grad_norm": 0.363889217376709, "learning_rate": 0.0002, "epoch": 1.1057153530070976, "step": 1480}, {"loss": 1.7933, "grad_norm": 0.3826201856136322, "learning_rate": 0.0002, "epoch": 1.113186402689578, "step": 1490}, {"loss": 1.7022, "grad_norm": 0.34058740735054016, "learning_rate": 0.0002, "epoch": 1.1206574523720583, "step": 1500}, {"loss": 1.6375, "grad_norm": 0.3462134301662445, "learning_rate": 0.0002, "epoch": 1.1281285020545386, "step": 1510}, {"loss": 1.7147, "grad_norm": 0.3396756052970886, "learning_rate": 0.0002, "epoch": 1.135599551737019, "step": 1520}, {"loss": 1.7219, "grad_norm": 0.32004743814468384, "learning_rate": 0.0002, "epoch": 1.1430706014194993, "step": 1530}, {"loss": 1.743, "grad_norm": 0.3397733271121979, "learning_rate": 0.0002, "epoch": 1.15054165110198, "step": 1540}, {"loss": 1.7333, "grad_norm": 0.3783262073993683, "learning_rate": 0.0002, "epoch": 1.1580127007844603, "step": 1550}, {"loss": 1.6075, "grad_norm": 0.35121291875839233, "learning_rate": 0.0002, "epoch": 1.1654837504669406, "step": 1560}, {"loss": 1.678, "grad_norm": 0.35816895961761475, "learning_rate": 0.0002, "epoch": 1.172954800149421, "step": 1570}, {"loss": 1.7143, "grad_norm": 0.33843839168548584, "learning_rate": 0.0002, "epoch": 1.1804258498319014, "step": 1580}, {"loss": 1.7434, "grad_norm": 0.3371972143650055, "learning_rate": 0.0002, "epoch": 1.1878968995143817, "step": 1590}, {"loss": 1.7671, "grad_norm": 0.36016878485679626, "learning_rate": 0.0002, "epoch": 1.195367949196862, "step": 1600}, {"loss": 1.6914, "grad_norm": 0.40879473090171814, "learning_rate": 0.0002, "epoch": 1.2028389988793426, "step": 1610}, {"loss": 1.6955, "grad_norm": 0.3216715455055237, "learning_rate": 0.0002, "epoch": 1.210310048561823, "step": 1620}, {"loss": 1.632, "grad_norm": 0.4482610821723938, "learning_rate": 0.0002, "epoch": 1.2177810982443034, "step": 1630}, {"loss": 1.6999, "grad_norm": 0.3257700502872467, "learning_rate": 0.0002, "epoch": 1.2252521479267837, "step": 1640}, {"loss": 1.7177, "grad_norm": 0.38646459579467773, "learning_rate": 0.0002, "epoch": 1.232723197609264, "step": 1650}, {"loss": 1.7081, "grad_norm": 0.4081360697746277, "learning_rate": 0.0002, "epoch": 1.2401942472917444, "step": 1660}, {"loss": 1.7519, "grad_norm": 0.4326848089694977, "learning_rate": 0.0002, "epoch": 1.2476652969742248, "step": 1670}, {"loss": 1.6752, "grad_norm": 0.346401572227478, "learning_rate": 0.0002, "epoch": 1.2551363466567054, "step": 1680}, {"loss": 1.7425, "grad_norm": 0.34536251425743103, "learning_rate": 0.0002, "epoch": 1.2626073963391857, "step": 1690}, {"loss": 1.7061, "grad_norm": 0.41359591484069824, "learning_rate": 0.0002, "epoch": 1.270078446021666, "step": 1700}, {"loss": 1.7906, "grad_norm": 0.3530874252319336, "learning_rate": 0.0002, "epoch": 1.2775494957041464, "step": 1710}, {"loss": 1.7357, "grad_norm": 0.3702719211578369, "learning_rate": 0.0002, "epoch": 1.2850205453866268, "step": 1720}, {"loss": 1.766, "grad_norm": 0.3703329563140869, "learning_rate": 0.0002, "epoch": 1.2924915950691072, "step": 1730}, {"loss": 1.7221, "grad_norm": 0.37919729948043823, "learning_rate": 0.0002, "epoch": 1.2999626447515875, "step": 1740}, {"loss": 1.7859, "grad_norm": 0.32526856660842896, "learning_rate": 0.0002, "epoch": 1.307433694434068, "step": 1750}, {"loss": 1.7117, "grad_norm": 0.36752620339393616, "learning_rate": 0.0002, "epoch": 1.3149047441165485, "step": 1760}, {"loss": 1.7335, "grad_norm": 0.3398192524909973, "learning_rate": 0.0002, "epoch": 1.3223757937990288, "step": 1770}, {"loss": 1.7492, "grad_norm": 0.37435585260391235, "learning_rate": 0.0002, "epoch": 1.3298468434815092, "step": 1780}, {"loss": 1.7393, "grad_norm": 0.35793280601501465, "learning_rate": 0.0002, "epoch": 1.3373178931639895, "step": 1790}, {"loss": 1.7266, "grad_norm": 0.35481882095336914, "learning_rate": 0.0002, "epoch": 1.3447889428464699, "step": 1800}, {"loss": 1.7456, "grad_norm": 0.3786393105983734, "learning_rate": 0.0002, "epoch": 1.3522599925289502, "step": 1810}, {"loss": 1.7169, "grad_norm": 0.33245593309402466, "learning_rate": 0.0002, "epoch": 1.3597310422114308, "step": 1820}, {"loss": 1.7577, "grad_norm": 0.35388344526290894, "learning_rate": 0.0002, "epoch": 1.3672020918939112, "step": 1830}, {"loss": 1.6968, "grad_norm": 0.3695325553417206, "learning_rate": 0.0002, "epoch": 1.3746731415763915, "step": 1840}, {"loss": 1.7086, "grad_norm": 0.3683604598045349, "learning_rate": 0.0002, "epoch": 1.382144191258872, "step": 1850}, {"loss": 1.7878, "grad_norm": 0.3753012418746948, "learning_rate": 0.0002, "epoch": 1.3896152409413522, "step": 1860}, {"loss": 1.6969, "grad_norm": 0.3331069350242615, "learning_rate": 0.0002, "epoch": 1.3970862906238326, "step": 1870}, {"loss": 1.6644, "grad_norm": 0.3877500295639038, "learning_rate": 0.0002, "epoch": 1.404557340306313, "step": 1880}, {"loss": 1.7586, "grad_norm": 0.33525151014328003, "learning_rate": 0.0002, "epoch": 1.4120283899887935, "step": 1890}, {"loss": 1.7031, "grad_norm": 0.3697299659252167, "learning_rate": 0.0002, "epoch": 1.4194994396712737, "step": 1900}, {"loss": 1.6956, "grad_norm": 0.4029286205768585, "learning_rate": 0.0002, "epoch": 1.4269704893537543, "step": 1910}, {"loss": 1.6897, "grad_norm": 0.3596203029155731, "learning_rate": 0.0002, "epoch": 1.4344415390362346, "step": 1920}, {"loss": 1.7139, "grad_norm": 0.450783908367157, "learning_rate": 0.0002, "epoch": 1.441912588718715, "step": 1930}, {"loss": 1.7243, "grad_norm": 0.3651481866836548, "learning_rate": 0.0002, "epoch": 1.4493836384011953, "step": 1940}, {"loss": 1.6637, "grad_norm": 0.3608424663543701, "learning_rate": 0.0002, "epoch": 1.4568546880836757, "step": 1950}, {"loss": 1.8285, "grad_norm": 0.39684420824050903, "learning_rate": 0.0002, "epoch": 1.4643257377661563, "step": 1960}, {"loss": 1.7514, "grad_norm": 0.34618663787841797, "learning_rate": 0.0002, "epoch": 1.4717967874486364, "step": 1970}, {"loss": 1.6655, "grad_norm": 0.4150386452674866, "learning_rate": 0.0002, "epoch": 1.479267837131117, "step": 1980}, {"loss": 1.7021, "grad_norm": 0.35500776767730713, "learning_rate": 0.0002, "epoch": 1.4867388868135973, "step": 1990}, {"loss": 1.7322, "grad_norm": 0.344144344329834, "learning_rate": 0.0002, "epoch": 1.4942099364960777, "step": 2000}, {"loss": 1.6998, "grad_norm": 0.3340149223804474, "learning_rate": 0.0002, "epoch": 1.501680986178558, "step": 2010}, {"loss": 1.7508, "grad_norm": 0.37685006856918335, "learning_rate": 0.0002, "epoch": 1.5091520358610384, "step": 2020}, {"loss": 1.8299, "grad_norm": 0.3699876368045807, "learning_rate": 0.0002, "epoch": 1.516623085543519, "step": 2030}, {"loss": 1.7357, "grad_norm": 0.3370307385921478, "learning_rate": 0.0002, "epoch": 1.5240941352259991, "step": 2040}, {"loss": 1.8044, "grad_norm": 0.37780630588531494, "learning_rate": 0.0002, "epoch": 1.5315651849084797, "step": 2050}, {"loss": 1.7408, "grad_norm": 0.370259165763855, "learning_rate": 0.0002, "epoch": 1.53903623459096, "step": 2060}, {"loss": 1.7398, "grad_norm": 0.3440011441707611, "learning_rate": 0.0002, "epoch": 1.5465072842734404, "step": 2070}, {"loss": 1.7105, "grad_norm": 0.40382063388824463, "learning_rate": 0.0002, "epoch": 1.5539783339559208, "step": 2080}, {"loss": 1.7071, "grad_norm": 0.38002029061317444, "learning_rate": 0.0002, "epoch": 1.5614493836384011, "step": 2090}, {"loss": 1.6815, "grad_norm": 0.3658451437950134, "learning_rate": 0.0002, "epoch": 1.5689204333208817, "step": 2100}, {"loss": 1.7598, "grad_norm": 0.354842871427536, "learning_rate": 0.0002, "epoch": 1.5763914830033618, "step": 2110}, {"loss": 1.6898, "grad_norm": 0.34735530614852905, "learning_rate": 0.0002, "epoch": 1.5838625326858424, "step": 2120}, {"loss": 1.7363, "grad_norm": 0.377581924200058, "learning_rate": 0.0002, "epoch": 1.5913335823683228, "step": 2130}, {"loss": 1.7789, "grad_norm": 0.41254034638404846, "learning_rate": 0.0002, "epoch": 1.5988046320508031, "step": 2140}, {"loss": 1.6782, "grad_norm": 0.3630715310573578, "learning_rate": 0.0002, "epoch": 1.6062756817332835, "step": 2150}, {"loss": 1.7531, "grad_norm": 0.36980143189430237, "learning_rate": 0.0002, "epoch": 1.6137467314157639, "step": 2160}, {"loss": 1.6847, "grad_norm": 0.3634769320487976, "learning_rate": 0.0002, "epoch": 1.6212177810982444, "step": 2170}, {"loss": 1.6367, "grad_norm": 0.3794139623641968, "learning_rate": 0.0002, "epoch": 1.6286888307807246, "step": 2180}, {"loss": 1.7064, "grad_norm": 0.359742134809494, "learning_rate": 0.0002, "epoch": 1.6361598804632052, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3770543932914734, "learning_rate": 0.0002, "epoch": 1.6436309301456855, "step": 2200}, {"loss": 1.784, "grad_norm": 0.3797036409378052, "learning_rate": 0.0002, "epoch": 1.6511019798281659, "step": 2210}, {"loss": 1.7875, "grad_norm": 0.35622093081474304, "learning_rate": 0.0002, "epoch": 1.6585730295106462, "step": 2220}, {"loss": 1.6615, "grad_norm": 0.34552520513534546, "learning_rate": 0.0002, "epoch": 1.6660440791931266, "step": 2230}, {"loss": 1.7522, "grad_norm": 0.379926860332489, "learning_rate": 0.0002, "epoch": 1.6735151288756072, "step": 2240}, {"loss": 1.7953, "grad_norm": 0.37083810567855835, "learning_rate": 0.0002, "epoch": 1.6809861785580873, "step": 2250}, {"loss": 1.7485, "grad_norm": 0.42746543884277344, "learning_rate": 0.0002, "epoch": 1.6884572282405679, "step": 2260}, {"loss": 1.776, "grad_norm": 0.3372884690761566, "learning_rate": 0.0002, "epoch": 1.6959282779230482, "step": 2270}, {"loss": 1.7604, "grad_norm": 0.35220256447792053, "learning_rate": 0.0002, "epoch": 1.7033993276055286, "step": 2280}, {"loss": 1.7154, "grad_norm": 0.3659130930900574, "learning_rate": 0.0002, "epoch": 1.710870377288009, "step": 2290}, {"loss": 1.6953, "grad_norm": 0.37629297375679016, "learning_rate": 0.0002, "epoch": 1.7183414269704893, "step": 2300}, {"loss": 1.7212, "grad_norm": 0.36312398314476013, "learning_rate": 0.0002, "epoch": 1.7258124766529699, "step": 2310}, {"loss": 1.7903, "grad_norm": 0.467709481716156, "learning_rate": 0.0002, "epoch": 1.73328352633545, "step": 2320}, {"loss": 1.696, "grad_norm": 0.38685527443885803, "learning_rate": 0.0002, "epoch": 1.7407545760179306, "step": 2330}, {"loss": 1.7041, "grad_norm": 0.3578338325023651, "learning_rate": 0.0002, "epoch": 1.748225625700411, "step": 2340}, {"loss": 1.6456, "grad_norm": 0.36057502031326294, "learning_rate": 0.0002, "epoch": 1.7556966753828913, "step": 2350}, {"loss": 1.6853, "grad_norm": 0.3615196645259857, "learning_rate": 0.0002, "epoch": 1.7631677250653717, "step": 2360}, {"loss": 1.7612, "grad_norm": 0.4118947684764862, "learning_rate": 0.0002, "epoch": 1.770638774747852, "step": 2370}, {"loss": 1.6946, "grad_norm": 0.4067276120185852, "learning_rate": 0.0002, "epoch": 1.7781098244303326, "step": 2380}, {"loss": 1.712, "grad_norm": 0.3979823887348175, "learning_rate": 0.0002, "epoch": 1.7855808741128127, "step": 2390}, {"loss": 1.7644, "grad_norm": 0.44045883417129517, "learning_rate": 0.0002, "epoch": 1.7930519237952933, "step": 2400}, {"loss": 1.7251, "grad_norm": 0.3998069167137146, "learning_rate": 0.0002, "epoch": 1.8005229734777737, "step": 2410}, {"loss": 1.7354, "grad_norm": 0.3450094759464264, "learning_rate": 0.0002, "epoch": 1.807994023160254, "step": 2420}, {"loss": 1.6998, "grad_norm": 0.3759009838104248, "learning_rate": 0.0002, "epoch": 1.8154650728427344, "step": 2430}, {"loss": 1.7706, "grad_norm": 0.34347015619277954, "learning_rate": 0.0002, "epoch": 1.8229361225252148, "step": 2440}, {"loss": 1.7345, "grad_norm": 0.3511228859424591, "learning_rate": 0.0002, "epoch": 1.8304071722076953, "step": 2450}, {"loss": 1.6909, "grad_norm": 0.36853715777397156, "learning_rate": 0.0002, "epoch": 1.8378782218901755, "step": 2460}, {"loss": 1.6931, "grad_norm": 0.40659376978874207, "learning_rate": 0.0002, "epoch": 1.845349271572656, "step": 2470}, {"loss": 1.7626, "grad_norm": 0.39621320366859436, "learning_rate": 0.0002, "epoch": 1.8528203212551362, "step": 2480}, {"loss": 1.7427, "grad_norm": 0.3753979504108429, "learning_rate": 0.0002, "epoch": 1.8602913709376168, "step": 2490}, {"loss": 1.6622, "grad_norm": 0.3811938464641571, "learning_rate": 0.0002, "epoch": 1.8677624206200971, "step": 2500}, {"loss": 1.7718, "grad_norm": 0.3432596027851105, "learning_rate": 0.0002, "epoch": 1.8752334703025775, "step": 2510}, {"loss": 1.7488, "grad_norm": 0.3670712113380432, "learning_rate": 0.0002, "epoch": 1.882704519985058, "step": 2520}, {"loss": 1.705, "grad_norm": 0.40907177329063416, "learning_rate": 0.0002, "epoch": 1.8901755696675382, "step": 2530}, {"loss": 1.7148, "grad_norm": 0.3821999728679657, "learning_rate": 0.0002, "epoch": 1.8976466193500188, "step": 2540}, {"loss": 1.7934, "grad_norm": 0.36173978447914124, "learning_rate": 0.0002, "epoch": 1.905117669032499, "step": 2550}, {"loss": 1.6939, "grad_norm": 0.38990336656570435, "learning_rate": 0.0002, "epoch": 1.9125887187149795, "step": 2560}, {"loss": 1.6893, "grad_norm": 0.35242322087287903, "learning_rate": 0.0002, "epoch": 1.9200597683974598, "step": 2570}, {"loss": 1.7268, "grad_norm": 0.3506428003311157, "learning_rate": 0.0002, "epoch": 1.9275308180799402, "step": 2580}, {"loss": 1.6953, "grad_norm": 0.39540135860443115, "learning_rate": 0.0002, "epoch": 1.9350018677624208, "step": 2590}, {"loss": 1.6511, "grad_norm": 0.3444725573062897, "learning_rate": 0.0002, "epoch": 1.942472917444901, "step": 2600}, {"loss": 1.7259, "grad_norm": 0.3963521718978882, "learning_rate": 0.0002, "epoch": 1.9499439671273815, "step": 2610}, {"loss": 1.6946, "grad_norm": 0.3689815402030945, "learning_rate": 0.0002, "epoch": 1.9574150168098616, "step": 2620}, {"loss": 1.7384, "grad_norm": 0.3482626676559448, "learning_rate": 0.0002, "epoch": 1.9648860664923422, "step": 2630}, {"loss": 1.7048, "grad_norm": 0.35832616686820984, "learning_rate": 0.0002, "epoch": 1.9723571161748226, "step": 2640}, {"loss": 1.6681, "grad_norm": 0.4776208996772766, "learning_rate": 0.0002, "epoch": 1.979828165857303, "step": 2650}, {"loss": 1.6696, "grad_norm": 0.32570165395736694, "learning_rate": 0.0002, "epoch": 1.9872992155397835, "step": 2660}, {"loss": 1.7232, "grad_norm": 0.3380725085735321, "learning_rate": 0.0002, "epoch": 1.9947702652222636, "step": 2670}, {"eval_loss": 1.8046749830245972, "eval_runtime": 38.5096, "eval_samples_per_second": 13.373, "eval_steps_per_second": 1.688, "epoch": 2.0, "step": 2677}, {"loss": 1.7265, "grad_norm": 0.36817631125450134, "learning_rate": 0.0002, "epoch": 2.002241314904744, "step": 2680}, {"loss": 1.548, "grad_norm": 0.4056456685066223, "learning_rate": 0.0002, "epoch": 2.0097123645872244, "step": 2690}, {"loss": 1.5515, "grad_norm": 0.37416863441467285, "learning_rate": 0.0002, "epoch": 2.017183414269705, "step": 2700}, {"loss": 1.5895, "grad_norm": 0.4273638427257538, "learning_rate": 0.0002, "epoch": 2.024654463952185, "step": 2710}, {"loss": 1.5884, "grad_norm": 0.36497923731803894, "learning_rate": 0.0002, "epoch": 2.0321255136346656, "step": 2720}, {"loss": 1.6999, "grad_norm": 0.5021994113922119, "learning_rate": 0.0002, "epoch": 2.0395965633171462, "step": 2730}, {"loss": 1.6655, "grad_norm": 0.45896220207214355, "learning_rate": 0.0002, "epoch": 2.0470676129996264, "step": 2740}, {"loss": 1.6305, "grad_norm": 0.3973815143108368, "learning_rate": 0.0002, "epoch": 2.054538662682107, "step": 2750}, {"loss": 1.6301, "grad_norm": 0.4521815776824951, "learning_rate": 0.0002, "epoch": 2.062009712364587, "step": 2760}, {"loss": 1.6189, "grad_norm": 0.42775002121925354, "learning_rate": 0.0002, "epoch": 2.0694807620470677, "step": 2770}, {"loss": 1.6491, "grad_norm": 0.48158586025238037, "learning_rate": 0.0002, "epoch": 2.076951811729548, "step": 2780}, {"loss": 1.6301, "grad_norm": 0.4612371623516083, "learning_rate": 0.0002, "epoch": 2.0844228614120284, "step": 2790}, {"loss": 1.6327, "grad_norm": 0.42536866664886475, "learning_rate": 0.0002, "epoch": 2.091893911094509, "step": 2800}, {"loss": 1.651, "grad_norm": 0.48515772819519043, "learning_rate": 0.0002, "epoch": 2.099364960776989, "step": 2810}, {"loss": 1.6829, "grad_norm": 0.41418662667274475, "learning_rate": 0.0002, "epoch": 2.1068360104594697, "step": 2820}, {"loss": 1.6266, "grad_norm": 0.4683697819709778, "learning_rate": 0.0002, "epoch": 2.11430706014195, "step": 2830}, {"loss": 1.6586, "grad_norm": 0.4484657049179077, "learning_rate": 0.0002, "epoch": 2.1217781098244304, "step": 2840}, {"loss": 1.6483, "grad_norm": 0.6621400713920593, "learning_rate": 0.0002, "epoch": 2.1292491595069105, "step": 2850}, {"loss": 1.5755, "grad_norm": 0.45074811577796936, "learning_rate": 0.0002, "epoch": 2.136720209189391, "step": 2860}, {"loss": 1.6456, "grad_norm": 0.3513113558292389, "learning_rate": 0.0002, "epoch": 2.1441912588718717, "step": 2870}, {"loss": 1.6081, "grad_norm": 0.40411314368247986, "learning_rate": 0.0002, "epoch": 2.151662308554352, "step": 2880}, {"loss": 1.6323, "grad_norm": 0.4121065139770508, "learning_rate": 0.0002, "epoch": 2.1591333582368324, "step": 2890}, {"loss": 1.6324, "grad_norm": 0.44723689556121826, "learning_rate": 0.0002, "epoch": 2.1666044079193125, "step": 2900}, {"loss": 1.5699, "grad_norm": 0.4226122498512268, "learning_rate": 0.0002, "epoch": 2.174075457601793, "step": 2910}, {"loss": 1.5652, "grad_norm": 0.46617650985717773, "learning_rate": 0.0002, "epoch": 2.1815465072842732, "step": 2920}, {"loss": 1.6378, "grad_norm": 0.4506422281265259, "learning_rate": 0.0002, "epoch": 2.189017556966754, "step": 2930}, {"loss": 1.6112, "grad_norm": 0.4892672896385193, "learning_rate": 0.0002, "epoch": 2.1964886066492344, "step": 2940}, {"loss": 1.6176, "grad_norm": 0.44095516204833984, "learning_rate": 0.0002, "epoch": 2.2039596563317145, "step": 2950}, {"loss": 1.6058, "grad_norm": 0.41522109508514404, "learning_rate": 0.0002, "epoch": 2.211430706014195, "step": 2960}, {"loss": 1.5964, "grad_norm": 0.4860858917236328, "learning_rate": 0.0002, "epoch": 2.2189017556966752, "step": 2970}, {"loss": 1.6427, "grad_norm": 0.42662516236305237, "learning_rate": 0.0002, "epoch": 2.226372805379156, "step": 2980}, {"loss": 1.6313, "grad_norm": 0.4390648305416107, "learning_rate": 0.0002, "epoch": 2.233843855061636, "step": 2990}, {"loss": 1.5992, "grad_norm": 0.47515565156936646, "learning_rate": 0.0002, "epoch": 2.2413149047441165, "step": 3000}, {"loss": 1.5563, "grad_norm": 0.4104543924331665, "learning_rate": 0.0002, "epoch": 2.248785954426597, "step": 3010}, {"loss": 1.6895, "grad_norm": 0.4404028654098511, "learning_rate": 0.0002, "epoch": 2.2562570041090773, "step": 3020}, {"loss": 1.6088, "grad_norm": 0.4717366695404053, "learning_rate": 0.0002, "epoch": 2.263728053791558, "step": 3030}, {"loss": 1.7287, "grad_norm": 0.48345857858657837, "learning_rate": 0.0002, "epoch": 2.271199103474038, "step": 3040}, {"loss": 1.681, "grad_norm": 0.5312452912330627, "learning_rate": 0.0002, "epoch": 2.2786701531565186, "step": 3050}, {"loss": 1.5901, "grad_norm": 0.5073099732398987, "learning_rate": 0.0002, "epoch": 2.2861412028389987, "step": 3060}, {"loss": 1.6914, "grad_norm": 0.5027463436126709, "learning_rate": 0.0002, "epoch": 2.2936122525214793, "step": 3070}, {"loss": 1.5862, "grad_norm": 0.5436304807662964, "learning_rate": 0.0002, "epoch": 2.30108330220396, "step": 3080}, {"loss": 1.5763, "grad_norm": 0.4701065123081207, "learning_rate": 0.0002, "epoch": 2.30855435188644, "step": 3090}, {"loss": 1.6177, "grad_norm": 0.46988746523857117, "learning_rate": 0.0002, "epoch": 2.3160254015689206, "step": 3100}, {"loss": 1.6502, "grad_norm": 0.45112869143486023, "learning_rate": 0.0002, "epoch": 2.3234964512514007, "step": 3110}, {"loss": 1.6291, "grad_norm": 0.5173566937446594, "learning_rate": 0.0002, "epoch": 2.3309675009338813, "step": 3120}, {"loss": 1.6743, "grad_norm": 0.40345850586891174, "learning_rate": 0.0002, "epoch": 2.3384385506163614, "step": 3130}, {"loss": 1.621, "grad_norm": 0.4218924939632416, "learning_rate": 0.0002, "epoch": 2.345909600298842, "step": 3140}, {"loss": 1.6341, "grad_norm": 0.41857317090034485, "learning_rate": 0.0002, "epoch": 2.3533806499813226, "step": 3150}, {"loss": 1.6087, "grad_norm": 0.4197218418121338, "learning_rate": 0.0002, "epoch": 2.3608516996638027, "step": 3160}, {"loss": 1.6572, "grad_norm": 0.4260677397251129, "learning_rate": 0.0002, "epoch": 2.3683227493462833, "step": 3170}, {"loss": 1.6376, "grad_norm": 0.4209042191505432, "learning_rate": 0.0002, "epoch": 2.3757937990287634, "step": 3180}, {"loss": 1.634, "grad_norm": 0.4092234969139099, "learning_rate": 0.0002, "epoch": 2.383264848711244, "step": 3190}, {"loss": 1.6339, "grad_norm": 0.4928431510925293, "learning_rate": 0.0002, "epoch": 2.390735898393724, "step": 3200}, {"loss": 1.6015, "grad_norm": 0.49252402782440186, "learning_rate": 0.0002, "epoch": 2.3982069480762047, "step": 3210}, {"loss": 1.5773, "grad_norm": 0.4368397295475006, "learning_rate": 0.0002, "epoch": 2.4056779977586853, "step": 3220}, {"loss": 1.6629, "grad_norm": 0.46122390031814575, "learning_rate": 0.0002, "epoch": 2.4131490474411654, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4272301197052002, "learning_rate": 0.0002, "epoch": 2.420620097123646, "step": 3240}, {"loss": 1.5961, "grad_norm": 0.41480937600135803, "learning_rate": 0.0002, "epoch": 2.428091146806126, "step": 3250}, {"loss": 1.6281, "grad_norm": 0.48911941051483154, "learning_rate": 0.0002, "epoch": 2.4355621964886067, "step": 3260}, {"loss": 1.6846, "grad_norm": 0.4444098472595215, "learning_rate": 0.0002, "epoch": 2.443033246171087, "step": 3270}, {"loss": 1.6961, "grad_norm": 0.5111684799194336, "learning_rate": 0.0002, "epoch": 2.4505042958535674, "step": 3280}, {"loss": 1.6152, "grad_norm": 0.5058825016021729, "learning_rate": 0.0002, "epoch": 2.457975345536048, "step": 3290}, {"loss": 1.625, "grad_norm": 0.44173210859298706, "learning_rate": 0.0002, "epoch": 2.465446395218528, "step": 3300}, {"loss": 1.6491, "grad_norm": 0.4659745991230011, "learning_rate": 0.0002, "epoch": 2.4729174449010087, "step": 3310}, {"loss": 1.6114, "grad_norm": 0.47237497568130493, "learning_rate": 0.0002, "epoch": 2.480388494583489, "step": 3320}, {"loss": 1.6193, "grad_norm": 0.47303131222724915, "learning_rate": 0.0002, "epoch": 2.4878595442659694, "step": 3330}, {"loss": 1.7256, "grad_norm": 0.4522389769554138, "learning_rate": 0.0002, "epoch": 2.4953305939484496, "step": 3340}, {"loss": 1.6834, "grad_norm": 0.4467332363128662, "learning_rate": 0.0002, "epoch": 2.50280164363093, "step": 3350}, {"loss": 1.6108, "grad_norm": 0.4413762092590332, "learning_rate": 0.0002, "epoch": 2.5102726933134107, "step": 3360}, {"loss": 1.537, "grad_norm": 0.495514452457428, "learning_rate": 0.0002, "epoch": 2.517743742995891, "step": 3370}, {"loss": 1.5839, "grad_norm": 0.4429773986339569, "learning_rate": 0.0002, "epoch": 2.5252147926783715, "step": 3380}, {"loss": 1.6522, "grad_norm": 0.4589079022407532, "learning_rate": 0.0002, "epoch": 2.5326858423608516, "step": 3390}, {"loss": 1.6529, "grad_norm": 0.4683997333049774, "learning_rate": 0.0002, "epoch": 2.540156892043332, "step": 3400}, {"loss": 1.6745, "grad_norm": 0.4651731252670288, "learning_rate": 0.0002, "epoch": 2.5476279417258123, "step": 3410}, {"loss": 1.5918, "grad_norm": 0.45818084478378296, "learning_rate": 0.0002, "epoch": 2.555098991408293, "step": 3420}, {"loss": 1.6326, "grad_norm": 0.45209529995918274, "learning_rate": 0.0002, "epoch": 2.5625700410907735, "step": 3430}, {"loss": 1.5606, "grad_norm": 0.4344733655452728, "learning_rate": 0.0002, "epoch": 2.5700410907732536, "step": 3440}, {"loss": 1.6748, "grad_norm": 0.47435566782951355, "learning_rate": 0.0002, "epoch": 2.577512140455734, "step": 3450}, {"loss": 1.6237, "grad_norm": 0.43841999769210815, "learning_rate": 0.0002, "epoch": 2.5849831901382143, "step": 3460}, {"loss": 1.7207, "grad_norm": 0.4323869049549103, "learning_rate": 0.0002, "epoch": 2.592454239820695, "step": 3470}, {"loss": 1.5494, "grad_norm": 0.44355881214141846, "learning_rate": 0.0002, "epoch": 2.599925289503175, "step": 3480}, {"loss": 1.665, "grad_norm": 0.45847779512405396, "learning_rate": 0.0002, "epoch": 2.6073963391856556, "step": 3490}, {"loss": 1.6006, "grad_norm": 0.4411061704158783, "learning_rate": 0.0002, "epoch": 2.614867388868136, "step": 3500}, {"loss": 1.5868, "grad_norm": 0.4446796178817749, "learning_rate": 0.0002, "epoch": 2.6223384385506163, "step": 3510}, {"loss": 1.5946, "grad_norm": 0.41969653964042664, "learning_rate": 0.0002, "epoch": 2.629809488233097, "step": 3520}, {"loss": 1.6798, "grad_norm": 0.5263747572898865, "learning_rate": 0.0002, "epoch": 2.637280537915577, "step": 3530}, {"loss": 1.6309, "grad_norm": 0.47719451785087585, "learning_rate": 0.0002, "epoch": 2.6447515875980576, "step": 3540}, {"loss": 1.7024, "grad_norm": 0.46574118733406067, "learning_rate": 0.0002, "epoch": 2.6522226372805378, "step": 3550}, {"loss": 1.618, "grad_norm": 0.46867135167121887, "learning_rate": 0.0002, "epoch": 2.6596936869630183, "step": 3560}, {"loss": 1.5885, "grad_norm": 0.4441198706626892, "learning_rate": 0.0002, "epoch": 2.667164736645499, "step": 3570}, {"loss": 1.6426, "grad_norm": 0.4871319830417633, "learning_rate": 0.0002, "epoch": 2.674635786327979, "step": 3580}, {"loss": 1.6575, "grad_norm": 0.43900373578071594, "learning_rate": 0.0002, "epoch": 2.6821068360104596, "step": 3590}, {"loss": 1.6071, "grad_norm": 0.42509549856185913, "learning_rate": 0.0002, "epoch": 2.6895778856929398, "step": 3600}, {"loss": 1.5651, "grad_norm": 0.4691086709499359, "learning_rate": 0.0002, "epoch": 2.6970489353754203, "step": 3610}, {"loss": 1.5491, "grad_norm": 0.46318942308425903, "learning_rate": 0.0002, "epoch": 2.7045199850579005, "step": 3620}, {"loss": 1.5422, "grad_norm": 0.44631096720695496, "learning_rate": 0.0002, "epoch": 2.711991034740381, "step": 3630}, {"loss": 1.6831, "grad_norm": 0.42315489053726196, "learning_rate": 0.0002, "epoch": 2.7194620844228616, "step": 3640}, {"loss": 1.6008, "grad_norm": 0.4971241056919098, "learning_rate": 0.0002, "epoch": 2.7269331341053418, "step": 3650}, {"loss": 1.6042, "grad_norm": 0.4578486382961273, "learning_rate": 0.0002, "epoch": 2.7344041837878224, "step": 3660}, {"loss": 1.6076, "grad_norm": 0.46584776043891907, "learning_rate": 0.0002, "epoch": 2.7418752334703025, "step": 3670}, {"loss": 1.6809, "grad_norm": 0.4951731264591217, "learning_rate": 0.0002, "epoch": 2.749346283152783, "step": 3680}, {"loss": 1.6226, "grad_norm": 0.4935225546360016, "learning_rate": 0.0002, "epoch": 2.756817332835263, "step": 3690}, {"loss": 1.5878, "grad_norm": 0.41805586218833923, "learning_rate": 0.0002, "epoch": 2.764288382517744, "step": 3700}, {"loss": 1.7173, "grad_norm": 0.4417555630207062, "learning_rate": 0.0002, "epoch": 2.7717594322002244, "step": 3710}, {"loss": 1.6398, "grad_norm": 0.48229655623435974, "learning_rate": 0.0002, "epoch": 2.7792304818827045, "step": 3720}, {"loss": 1.6074, "grad_norm": 0.48562315106391907, "learning_rate": 0.0002, "epoch": 2.786701531565185, "step": 3730}, {"loss": 1.607, "grad_norm": 0.4473940432071686, "learning_rate": 0.0002, "epoch": 2.794172581247665, "step": 3740}, {"loss": 1.6065, "grad_norm": 0.4626813232898712, "learning_rate": 0.0002, "epoch": 2.801643630930146, "step": 3750}, {"loss": 1.6296, "grad_norm": 0.4339792728424072, "learning_rate": 0.0002, "epoch": 2.809114680612626, "step": 3760}, {"loss": 1.6815, "grad_norm": 0.5250858068466187, "learning_rate": 0.0002, "epoch": 2.8165857302951065, "step": 3770}, {"loss": 1.6644, "grad_norm": 0.4537523090839386, "learning_rate": 0.0002, "epoch": 2.824056779977587, "step": 3780}, {"loss": 1.6535, "grad_norm": 0.5646113157272339, "learning_rate": 0.0002, "epoch": 2.831527829660067, "step": 3790}, {"loss": 1.5712, "grad_norm": 0.44243332743644714, "learning_rate": 0.0002, "epoch": 2.8389988793425474, "step": 3800}, {"loss": 1.6478, "grad_norm": 0.4585791826248169, "learning_rate": 0.0002, "epoch": 2.846469929025028, "step": 3810}, {"loss": 1.6854, "grad_norm": 0.489702045917511, "learning_rate": 0.0002, "epoch": 2.8539409787075085, "step": 3820}, {"loss": 1.7066, "grad_norm": 0.502470850944519, "learning_rate": 0.0002, "epoch": 2.8614120283899886, "step": 3830}, {"loss": 1.5785, "grad_norm": 0.4395960867404938, "learning_rate": 0.0002, "epoch": 2.8688830780724692, "step": 3840}, {"loss": 1.6434, "grad_norm": 0.4348670244216919, "learning_rate": 0.0002, "epoch": 2.87635412775495, "step": 3850}, {"loss": 1.6163, "grad_norm": 0.48852720856666565, "learning_rate": 0.0002, "epoch": 2.88382517743743, "step": 3860}, {"loss": 1.5916, "grad_norm": 0.45317450165748596, "learning_rate": 0.0002, "epoch": 2.89129622711991, "step": 3870}, {"loss": 1.6486, "grad_norm": 0.4732758700847626, "learning_rate": 0.0002, "epoch": 2.8987672768023907, "step": 3880}, {"loss": 1.6758, "grad_norm": 0.45238012075424194, "learning_rate": 0.0002, "epoch": 2.9062383264848712, "step": 3890}, {"loss": 1.6228, "grad_norm": 0.48838064074516296, "learning_rate": 0.0002, "epoch": 2.9137093761673514, "step": 3900}, {"loss": 1.658, "grad_norm": 0.43496349453926086, "learning_rate": 0.0002, "epoch": 2.921180425849832, "step": 3910}, {"loss": 1.7063, "grad_norm": 0.47963935136795044, "learning_rate": 0.0002, "epoch": 2.9286514755323125, "step": 3920}, {"loss": 1.6553, "grad_norm": 0.4544987976551056, "learning_rate": 0.0002, "epoch": 2.9361225252147927, "step": 3930}, {"loss": 1.6192, "grad_norm": 0.4622892141342163, "learning_rate": 0.0002, "epoch": 2.943593574897273, "step": 3940}, {"loss": 1.6178, "grad_norm": 0.47026222944259644, "learning_rate": 0.0002, "epoch": 2.9510646245797534, "step": 3950}, {"loss": 1.6612, "grad_norm": 0.4549552798271179, "learning_rate": 0.0002, "epoch": 2.958535674262234, "step": 3960}, {"loss": 1.6458, "grad_norm": 0.46647515892982483, "learning_rate": 0.0002, "epoch": 2.966006723944714, "step": 3970}, {"loss": 1.6051, "grad_norm": 0.45095112919807434, "learning_rate": 0.0002, "epoch": 2.9734777736271947, "step": 3980}, {"loss": 1.6471, "grad_norm": 0.4690017104148865, "learning_rate": 0.0002, "epoch": 2.9809488233096753, "step": 3990}, {"loss": 1.6061, "grad_norm": 0.4603444039821625, "learning_rate": 0.0002, "epoch": 2.9884198729921554, "step": 4000}, {"loss": 1.6431, "grad_norm": 0.4743294417858124, "learning_rate": 0.0002, "epoch": 2.9958909226746355, "step": 4010}, {"eval_loss": 1.8252571821212769, "eval_runtime": 38.7853, "eval_samples_per_second": 13.278, "eval_steps_per_second": 1.676, "epoch": 2.999626447515876, "step": 4015}, {"loss": 1.6512, "grad_norm": 0.4919724464416504, "learning_rate": 0.0002, "epoch": 3.003361972357116, "step": 4020}, {"loss": 1.5354, "grad_norm": 0.4747185707092285, "learning_rate": 0.0002, "epoch": 3.0108330220395967, "step": 4030}, {"loss": 1.568, "grad_norm": 0.4797595143318176, "learning_rate": 0.0002, "epoch": 3.018304071722077, "step": 4040}, {"loss": 1.5194, "grad_norm": 0.5450999140739441, "learning_rate": 0.0002, "epoch": 3.0257751214045574, "step": 4050}, {"loss": 1.5065, "grad_norm": 0.49058812856674194, "learning_rate": 0.0002, "epoch": 3.0332461710870375, "step": 4060}, {"loss": 1.4884, "grad_norm": 0.5219563841819763, "learning_rate": 0.0002, "epoch": 3.040717220769518, "step": 4070}, {"loss": 1.4742, "grad_norm": 0.515628457069397, "learning_rate": 0.0002, "epoch": 3.0481882704519987, "step": 4080}, {"loss": 1.5313, "grad_norm": 0.6145984530448914, "learning_rate": 0.0002, "epoch": 3.055659320134479, "step": 4090}, {"loss": 1.4989, "grad_norm": 0.6067144274711609, "learning_rate": 0.0002, "epoch": 3.0631303698169594, "step": 4100}, {"loss": 1.528, "grad_norm": 0.5773133039474487, "learning_rate": 0.0002, "epoch": 3.0706014194994395, "step": 4110}, {"loss": 1.5374, "grad_norm": 0.6894241571426392, "learning_rate": 0.0002, "epoch": 3.07807246918192, "step": 4120}, {"loss": 1.5422, "grad_norm": 0.6422514915466309, "learning_rate": 0.0002, "epoch": 3.0855435188644003, "step": 4130}, {"loss": 1.4724, "grad_norm": 0.6119855046272278, "learning_rate": 0.0002, "epoch": 3.093014568546881, "step": 4140}, {"loss": 1.5361, "grad_norm": 0.5847280025482178, "learning_rate": 0.0002, "epoch": 3.1004856182293614, "step": 4150}, {"loss": 1.5151, "grad_norm": 0.5401515960693359, "learning_rate": 0.0002, "epoch": 3.1079566679118416, "step": 4160}, {"loss": 1.502, "grad_norm": 0.6501587629318237, "learning_rate": 0.0002, "epoch": 3.115427717594322, "step": 4170}, {"loss": 1.4952, "grad_norm": 0.5988039374351501, "learning_rate": 0.0002, "epoch": 3.1228987672768023, "step": 4180}, {"loss": 1.5287, "grad_norm": 0.4982665181159973, "learning_rate": 0.0002, "epoch": 3.130369816959283, "step": 4190}, {"loss": 1.5078, "grad_norm": 0.5548039078712463, "learning_rate": 0.0002, "epoch": 3.137840866641763, "step": 4200}, {"loss": 1.4904, "grad_norm": 0.5920777320861816, "learning_rate": 0.0002, "epoch": 3.1453119163242436, "step": 4210}, {"loss": 1.442, "grad_norm": 0.6965190172195435, "learning_rate": 0.0002, "epoch": 3.152782966006724, "step": 4220}, {"loss": 1.557, "grad_norm": 0.5196244716644287, "learning_rate": 0.0002, "epoch": 3.1602540156892043, "step": 4230}, {"loss": 1.5706, "grad_norm": 0.6942682266235352, "learning_rate": 0.0002, "epoch": 3.167725065371685, "step": 4240}, {"loss": 1.5407, "grad_norm": 0.5765156149864197, "learning_rate": 0.0002, "epoch": 3.175196115054165, "step": 4250}, {"loss": 1.4963, "grad_norm": 0.5801976919174194, "learning_rate": 0.0002, "epoch": 3.1826671647366456, "step": 4260}, {"loss": 1.4988, "grad_norm": 0.6260752081871033, "learning_rate": 0.0002, "epoch": 3.1901382144191257, "step": 4270}, {"loss": 1.5074, "grad_norm": 0.6610770225524902, "learning_rate": 0.0002, "epoch": 3.1976092641016063, "step": 4280}, {"loss": 1.4657, "grad_norm": 0.5762143135070801, "learning_rate": 0.0002, "epoch": 3.205080313784087, "step": 4290}, {"loss": 1.5181, "grad_norm": 0.5926990509033203, "learning_rate": 0.0002, "epoch": 3.212551363466567, "step": 4300}, {"loss": 1.5492, "grad_norm": 0.7373854517936707, "learning_rate": 0.0002, "epoch": 3.2200224131490476, "step": 4310}, {"loss": 1.4648, "grad_norm": 0.5963311195373535, "learning_rate": 0.0002, "epoch": 3.2274934628315277, "step": 4320}, {"loss": 1.5262, "grad_norm": 0.5754616856575012, "learning_rate": 0.0002, "epoch": 3.2349645125140083, "step": 4330}, {"loss": 1.4767, "grad_norm": 0.6116095781326294, "learning_rate": 0.0002, "epoch": 3.2424355621964884, "step": 4340}, {"loss": 1.5008, "grad_norm": 0.6001536846160889, "learning_rate": 0.0002, "epoch": 3.249906611878969, "step": 4350}, {"loss": 1.5738, "grad_norm": 0.5270227789878845, "learning_rate": 0.0002, "epoch": 3.257377661561449, "step": 4360}, {"loss": 1.5235, "grad_norm": 0.6666602492332458, "learning_rate": 0.0002, "epoch": 3.2648487112439297, "step": 4370}, {"loss": 1.5665, "grad_norm": 0.520310640335083, "learning_rate": 0.0002, "epoch": 3.2723197609264103, "step": 4380}, {"loss": 1.542, "grad_norm": 0.5165975093841553, "learning_rate": 0.0002, "epoch": 3.2797908106088904, "step": 4390}, {"loss": 1.4746, "grad_norm": 0.6080228686332703, "learning_rate": 0.0002, "epoch": 3.287261860291371, "step": 4400}, {"loss": 1.4901, "grad_norm": 0.670122504234314, "learning_rate": 0.0002, "epoch": 3.294732909973851, "step": 4410}, {"loss": 1.4677, "grad_norm": 0.6019457578659058, "learning_rate": 0.0002, "epoch": 3.3022039596563317, "step": 4420}, {"loss": 1.4249, "grad_norm": 0.5519300103187561, "learning_rate": 0.0002, "epoch": 3.309675009338812, "step": 4430}, {"loss": 1.555, "grad_norm": 0.5958521962165833, "learning_rate": 0.0002, "epoch": 3.3171460590212924, "step": 4440}, {"loss": 1.5067, "grad_norm": 0.5552705526351929, "learning_rate": 0.0002, "epoch": 3.324617108703773, "step": 4450}, {"loss": 1.5926, "grad_norm": 0.6583784818649292, "learning_rate": 0.0002, "epoch": 3.332088158386253, "step": 4460}, {"loss": 1.4206, "grad_norm": 0.5815939903259277, "learning_rate": 0.0002, "epoch": 3.3395592080687337, "step": 4470}, {"loss": 1.5942, "grad_norm": 1.3342205286026, "learning_rate": 0.0002, "epoch": 3.347030257751214, "step": 4480}, {"loss": 1.484, "grad_norm": 0.6341500878334045, "learning_rate": 0.0002, "epoch": 3.3545013074336945, "step": 4490}, {"loss": 1.5219, "grad_norm": 0.6384079456329346, "learning_rate": 0.0002, "epoch": 3.3619723571161746, "step": 4500}, {"loss": 1.5222, "grad_norm": 0.6098346710205078, "learning_rate": 0.0002, "epoch": 3.369443406798655, "step": 4510}, {"loss": 1.5475, "grad_norm": 0.5958296656608582, "learning_rate": 0.0002, "epoch": 3.3769144564811358, "step": 4520}, {"loss": 1.5171, "grad_norm": 0.6157881617546082, "learning_rate": 0.0002, "epoch": 3.384385506163616, "step": 4530}, {"loss": 1.569, "grad_norm": 0.5671007037162781, "learning_rate": 0.0002, "epoch": 3.3918565558460965, "step": 4540}, {"loss": 1.604, "grad_norm": 0.6203294992446899, "learning_rate": 0.0002, "epoch": 3.3993276055285766, "step": 4550}, {"loss": 1.5364, "grad_norm": 0.6743317246437073, "learning_rate": 0.0002, "epoch": 3.406798655211057, "step": 4560}, {"loss": 1.5034, "grad_norm": 0.731765627861023, "learning_rate": 0.0002, "epoch": 3.4142697048935373, "step": 4570}, {"loss": 1.4585, "grad_norm": 0.6285187602043152, "learning_rate": 0.0002, "epoch": 3.421740754576018, "step": 4580}, {"loss": 1.5296, "grad_norm": 0.612680196762085, "learning_rate": 0.0002, "epoch": 3.4292118042584985, "step": 4590}, {"loss": 1.5577, "grad_norm": 0.6413681507110596, "learning_rate": 0.0002, "epoch": 3.4366828539409786, "step": 4600}, {"loss": 1.5026, "grad_norm": 0.6240990161895752, "learning_rate": 0.0002, "epoch": 3.444153903623459, "step": 4610}, {"loss": 1.5887, "grad_norm": 0.5095735192298889, "learning_rate": 0.0002, "epoch": 3.4516249533059393, "step": 4620}, {"loss": 1.4906, "grad_norm": 0.5699611902236938, "learning_rate": 0.0002, "epoch": 3.45909600298842, "step": 4630}, {"loss": 1.5176, "grad_norm": 0.7289775609970093, "learning_rate": 0.0002, "epoch": 3.4665670526709, "step": 4640}, {"loss": 1.5467, "grad_norm": 0.6211609840393066, "learning_rate": 0.0002, "epoch": 3.4740381023533806, "step": 4650}, {"loss": 1.533, "grad_norm": 0.5714802145957947, "learning_rate": 0.0002, "epoch": 3.481509152035861, "step": 4660}, {"loss": 1.5096, "grad_norm": 0.6287049651145935, "learning_rate": 0.0002, "epoch": 3.4889802017183413, "step": 4670}, {"loss": 1.4212, "grad_norm": 0.5480595827102661, "learning_rate": 0.0002, "epoch": 3.496451251400822, "step": 4680}, {"loss": 1.4746, "grad_norm": 0.5683253407478333, "learning_rate": 0.0002, "epoch": 3.503922301083302, "step": 4690}, {"loss": 1.5012, "grad_norm": 0.601140558719635, "learning_rate": 0.0002, "epoch": 3.5113933507657826, "step": 4700}, {"loss": 1.5383, "grad_norm": 0.5344498157501221, "learning_rate": 0.0002, "epoch": 3.5188644004482628, "step": 4710}, {"loss": 1.5428, "grad_norm": 0.5739690661430359, "learning_rate": 0.0002, "epoch": 3.5263354501307433, "step": 4720}, {"loss": 1.5589, "grad_norm": 0.5640085935592651, "learning_rate": 0.0002, "epoch": 3.533806499813224, "step": 4730}, {"loss": 1.487, "grad_norm": 0.5967805981636047, "learning_rate": 0.0002, "epoch": 3.541277549495704, "step": 4740}, {"loss": 1.5461, "grad_norm": 0.6138835549354553, "learning_rate": 0.0002, "epoch": 3.5487485991781846, "step": 4750}, {"loss": 1.5502, "grad_norm": 0.6779900193214417, "learning_rate": 0.0002, "epoch": 3.5562196488606648, "step": 4760}, {"loss": 1.4917, "grad_norm": 0.6122010350227356, "learning_rate": 0.0002, "epoch": 3.5636906985431454, "step": 4770}, {"loss": 1.5405, "grad_norm": 0.5685241222381592, "learning_rate": 0.0002, "epoch": 3.5711617482256255, "step": 4780}, {"loss": 1.5427, "grad_norm": 0.604583203792572, "learning_rate": 0.0002, "epoch": 3.578632797908106, "step": 4790}, {"loss": 1.4514, "grad_norm": 0.651165246963501, "learning_rate": 0.0002, "epoch": 3.5861038475905866, "step": 4800}, {"loss": 1.4109, "grad_norm": 0.6398511528968811, "learning_rate": 0.0002, "epoch": 3.593574897273067, "step": 4810}, {"loss": 1.4261, "grad_norm": 0.6444641351699829, "learning_rate": 0.0002, "epoch": 3.6010459469555474, "step": 4820}, {"loss": 1.5274, "grad_norm": 0.6018481850624084, "learning_rate": 0.0002, "epoch": 3.6085169966380275, "step": 4830}, {"loss": 1.4647, "grad_norm": 0.6025291085243225, "learning_rate": 0.0002, "epoch": 3.615988046320508, "step": 4840}, {"loss": 1.5609, "grad_norm": 0.6810156106948853, "learning_rate": 0.0002, "epoch": 3.623459096002988, "step": 4850}, {"loss": 1.5299, "grad_norm": 0.6408044695854187, "learning_rate": 0.0002, "epoch": 3.630930145685469, "step": 4860}, {"loss": 1.5366, "grad_norm": 0.5608272552490234, "learning_rate": 0.0002, "epoch": 3.6384011953679494, "step": 4870}, {"loss": 1.5188, "grad_norm": 0.6136814951896667, "learning_rate": 0.0002, "epoch": 3.6458722450504295, "step": 4880}, {"loss": 1.5021, "grad_norm": 0.5927900075912476, "learning_rate": 0.0002, "epoch": 3.65334329473291, "step": 4890}, {"loss": 1.6084, "grad_norm": 0.5336901545524597, "learning_rate": 0.0002, "epoch": 3.66081434441539, "step": 4900}, {"loss": 1.5701, "grad_norm": 0.7823320627212524, "learning_rate": 0.0002, "epoch": 3.668285394097871, "step": 4910}, {"loss": 1.4881, "grad_norm": 0.6703504323959351, "learning_rate": 0.0002, "epoch": 3.675756443780351, "step": 4920}, {"loss": 1.5332, "grad_norm": 0.6061160564422607, "learning_rate": 0.0002, "epoch": 3.6832274934628315, "step": 4930}, {"loss": 1.5405, "grad_norm": 0.6237227916717529, "learning_rate": 0.0002, "epoch": 3.690698543145312, "step": 4940}, {"loss": 1.497, "grad_norm": 0.5985278487205505, "learning_rate": 0.0002, "epoch": 3.6981695928277922, "step": 4950}, {"loss": 1.5132, "grad_norm": 0.6483839750289917, "learning_rate": 0.0002, "epoch": 3.705640642510273, "step": 4960}, {"loss": 1.5338, "grad_norm": 0.5788805484771729, "learning_rate": 0.0002, "epoch": 3.713111692192753, "step": 4970}, {"loss": 1.5258, "grad_norm": 0.5609974265098572, "learning_rate": 0.0002, "epoch": 3.7205827418752335, "step": 4980}, {"loss": 1.4759, "grad_norm": 0.5681300759315491, "learning_rate": 0.0002, "epoch": 3.7280537915577137, "step": 4990}, {"loss": 1.6018, "grad_norm": 0.5860186219215393, "learning_rate": 0.0002, "epoch": 3.7355248412401942, "step": 5000}, {"loss": 1.58, "grad_norm": 0.5718157291412354, "learning_rate": 0.0002, "epoch": 3.742995890922675, "step": 5010}, {"loss": 1.5834, "grad_norm": 0.6173721551895142, "learning_rate": 0.0002, "epoch": 3.750466940605155, "step": 5020}, {"loss": 1.5617, "grad_norm": 0.629152238368988, "learning_rate": 0.0002, "epoch": 3.7579379902876355, "step": 5030}, {"loss": 1.519, "grad_norm": 0.5666284561157227, "learning_rate": 0.0002, "epoch": 3.7654090399701157, "step": 5040}, {"loss": 1.5329, "grad_norm": 0.6053005456924438, "learning_rate": 0.0002, "epoch": 3.7728800896525962, "step": 5050}, {"loss": 1.5404, "grad_norm": 0.5870583057403564, "learning_rate": 0.0002, "epoch": 3.7803511393350764, "step": 5060}, {"loss": 1.4444, "grad_norm": 0.5422009229660034, "learning_rate": 0.0002, "epoch": 3.787822189017557, "step": 5070}, {"loss": 1.5308, "grad_norm": 0.5396918058395386, "learning_rate": 0.0002, "epoch": 3.7952932387000375, "step": 5080}, {"loss": 1.464, "grad_norm": 0.5544713139533997, "learning_rate": 0.0002, "epoch": 3.8027642883825177, "step": 5090}, {"loss": 1.4752, "grad_norm": 0.5983749628067017, "learning_rate": 0.0002, "epoch": 3.8102353380649983, "step": 5100}, {"loss": 1.4972, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 3.8177063877474784, "step": 5110}, {"loss": 1.5471, "grad_norm": 0.5436882376670837, "learning_rate": 0.0002, "epoch": 3.825177437429959, "step": 5120}, {"loss": 1.5118, "grad_norm": 0.5453617572784424, "learning_rate": 0.0002, "epoch": 3.832648487112439, "step": 5130}, {"loss": 1.5732, "grad_norm": 0.6269069314002991, "learning_rate": 0.0002, "epoch": 3.8401195367949197, "step": 5140}, {"loss": 1.4959, "grad_norm": 0.6189185380935669, "learning_rate": 0.0002, "epoch": 3.8475905864774003, "step": 5150}, {"loss": 1.4999, "grad_norm": 0.6653388142585754, "learning_rate": 0.0002, "epoch": 3.8550616361598804, "step": 5160}, {"loss": 1.5075, "grad_norm": 0.5771768689155579, "learning_rate": 0.0002, "epoch": 3.862532685842361, "step": 5170}, {"loss": 1.5545, "grad_norm": 0.6052790880203247, "learning_rate": 0.0002, "epoch": 3.870003735524841, "step": 5180}, {"loss": 1.4987, "grad_norm": 0.6572316884994507, "learning_rate": 0.0002, "epoch": 3.8774747852073217, "step": 5190}, {"loss": 1.5241, "grad_norm": 0.670576810836792, "learning_rate": 0.0002, "epoch": 3.884945834889802, "step": 5200}, {"loss": 1.4777, "grad_norm": 0.5728798508644104, "learning_rate": 0.0002, "epoch": 3.8924168845722824, "step": 5210}, {"loss": 1.5351, "grad_norm": 0.6340774297714233, "learning_rate": 0.0002, "epoch": 3.899887934254763, "step": 5220}, {"loss": 1.5081, "grad_norm": 0.5981315970420837, "learning_rate": 0.0002, "epoch": 3.907358983937243, "step": 5230}, {"loss": 1.4875, "grad_norm": 0.6212025880813599, "learning_rate": 0.0002, "epoch": 3.9148300336197237, "step": 5240}, {"loss": 1.5545, "grad_norm": 0.6202296018600464, "learning_rate": 0.0002, "epoch": 3.922301083302204, "step": 5250}, {"loss": 1.5765, "grad_norm": 0.6159142255783081, "learning_rate": 0.0002, "epoch": 3.9297721329846844, "step": 5260}, {"loss": 1.4938, "grad_norm": 0.6519438624382019, "learning_rate": 0.0002, "epoch": 3.9372431826671646, "step": 5270}, {"loss": 1.4859, "grad_norm": 0.539813756942749, "learning_rate": 0.0002, "epoch": 3.944714232349645, "step": 5280}, {"loss": 1.5921, "grad_norm": 0.6443665027618408, "learning_rate": 0.0002, "epoch": 3.9521852820321257, "step": 5290}, {"loss": 1.5153, "grad_norm": 0.6635757684707642, "learning_rate": 0.0002, "epoch": 3.959656331714606, "step": 5300}, {"loss": 1.5485, "grad_norm": 0.589363157749176, "learning_rate": 0.0002, "epoch": 3.9671273813970864, "step": 5310}, {"loss": 1.5498, "grad_norm": 0.5788735747337341, "learning_rate": 0.0002, "epoch": 3.9745984310795666, "step": 5320}, {"loss": 1.5607, "grad_norm": 0.5976864695549011, "learning_rate": 0.0002, "epoch": 3.982069480762047, "step": 5330}, {"loss": 1.5302, "grad_norm": 0.6624067425727844, "learning_rate": 0.0002, "epoch": 3.9895405304445273, "step": 5340}, {"loss": 1.5904, "grad_norm": 0.6738956570625305, "learning_rate": 0.0002, "epoch": 3.997011580127008, "step": 5350}, {"eval_loss": 1.868006944656372, "eval_runtime": 38.5153, "eval_samples_per_second": 13.371, "eval_steps_per_second": 1.688, "epoch": 4.0, "step": 5354}, {"loss": 1.4535, "grad_norm": 0.6023468971252441, "learning_rate": 0.0002, "epoch": 4.004482629809488, "step": 5360}, {"loss": 1.3987, "grad_norm": 0.8589285612106323, "learning_rate": 0.0002, "epoch": 4.011953679491969, "step": 5370}, {"loss": 1.3952, "grad_norm": 0.7477491497993469, "learning_rate": 0.0002, "epoch": 4.019424729174449, "step": 5380}, {"loss": 1.3745, "grad_norm": 0.7601922154426575, "learning_rate": 0.0002, "epoch": 4.02689577885693, "step": 5390}, {"loss": 1.4133, "grad_norm": 0.8115614056587219, "learning_rate": 0.0002, "epoch": 4.03436682853941, "step": 5400}, {"loss": 1.3748, "grad_norm": 0.669925332069397, "learning_rate": 0.0002, "epoch": 4.04183787822189, "step": 5410}, {"loss": 1.2835, "grad_norm": 0.8091904520988464, "learning_rate": 0.0002, "epoch": 4.04930892790437, "step": 5420}, {"loss": 1.3615, "grad_norm": 0.709405779838562, "learning_rate": 0.0002, "epoch": 4.056779977586851, "step": 5430}, {"loss": 1.3558, "grad_norm": 1.0006179809570312, "learning_rate": 0.0002, "epoch": 4.064251027269331, "step": 5440}, {"loss": 1.3491, "grad_norm": 0.7017965912818909, "learning_rate": 0.0002, "epoch": 4.071722076951811, "step": 5450}, {"loss": 1.3642, "grad_norm": 0.8991572260856628, "learning_rate": 0.0002, "epoch": 4.0791931266342925, "step": 5460}, {"loss": 1.392, "grad_norm": 0.9064797759056091, "learning_rate": 0.0002, "epoch": 4.086664176316773, "step": 5470}, {"loss": 1.3425, "grad_norm": 0.7981749176979065, "learning_rate": 0.0002, "epoch": 4.094135225999253, "step": 5480}, {"loss": 1.3826, "grad_norm": 0.7280883193016052, "learning_rate": 0.0002, "epoch": 4.101606275681733, "step": 5490}, {"loss": 1.3275, "grad_norm": 0.7419600486755371, "learning_rate": 0.0002, "epoch": 4.109077325364214, "step": 5500}, {"loss": 1.3199, "grad_norm": 0.8019949197769165, "learning_rate": 0.0002, "epoch": 4.116548375046694, "step": 5510}, {"loss": 1.3133, "grad_norm": 0.7501229047775269, "learning_rate": 0.0002, "epoch": 4.124019424729174, "step": 5520}, {"loss": 1.4432, "grad_norm": 0.8166249990463257, "learning_rate": 0.0002, "epoch": 4.131490474411655, "step": 5530}, {"loss": 1.3901, "grad_norm": 0.9728496074676514, "learning_rate": 0.0002, "epoch": 4.138961524094135, "step": 5540}, {"loss": 1.3538, "grad_norm": 0.7590922117233276, "learning_rate": 0.0002, "epoch": 4.1464325737766154, "step": 5550}, {"loss": 1.4368, "grad_norm": 0.7759010791778564, "learning_rate": 0.0002, "epoch": 4.153903623459096, "step": 5560}, {"loss": 1.3635, "grad_norm": 0.9057986736297607, "learning_rate": 0.0002, "epoch": 4.161374673141577, "step": 5570}, {"loss": 1.4152, "grad_norm": 0.8853937983512878, "learning_rate": 0.0002, "epoch": 4.168845722824057, "step": 5580}, {"loss": 1.3633, "grad_norm": 0.7070684432983398, "learning_rate": 0.0002, "epoch": 4.176316772506537, "step": 5590}, {"loss": 1.3218, "grad_norm": 0.7649410963058472, "learning_rate": 0.0002, "epoch": 4.183787822189018, "step": 5600}, {"loss": 1.3857, "grad_norm": 1.2048029899597168, "learning_rate": 0.0002, "epoch": 4.191258871871498, "step": 5610}, {"loss": 1.3629, "grad_norm": 0.7986605763435364, "learning_rate": 0.0002, "epoch": 4.198729921553978, "step": 5620}, {"loss": 1.3995, "grad_norm": 0.8151885867118835, "learning_rate": 0.0002, "epoch": 4.206200971236458, "step": 5630}, {"loss": 1.3782, "grad_norm": 0.7719064354896545, "learning_rate": 0.0002, "epoch": 4.213672020918939, "step": 5640}, {"loss": 1.3852, "grad_norm": 0.8422448039054871, "learning_rate": 0.0002, "epoch": 4.2211430706014195, "step": 5650}, {"loss": 1.3321, "grad_norm": 0.7017164826393127, "learning_rate": 0.0002, "epoch": 4.2286141202839, "step": 5660}, {"loss": 1.4105, "grad_norm": 0.8559677600860596, "learning_rate": 0.0002, "epoch": 4.236085169966381, "step": 5670}, {"loss": 1.3701, "grad_norm": 0.8216157555580139, "learning_rate": 0.0002, "epoch": 4.243556219648861, "step": 5680}, {"loss": 1.3565, "grad_norm": 0.7681755423545837, "learning_rate": 0.0002, "epoch": 4.251027269331341, "step": 5690}, {"loss": 1.3806, "grad_norm": 0.811665952205658, "learning_rate": 0.0002, "epoch": 4.258498319013821, "step": 5700}, {"loss": 1.4161, "grad_norm": 0.7242204546928406, "learning_rate": 0.0002, "epoch": 4.265969368696302, "step": 5710}, {"loss": 1.2958, "grad_norm": 0.7570181488990784, "learning_rate": 0.0002, "epoch": 4.273440418378782, "step": 5720}, {"loss": 1.4265, "grad_norm": 0.8951969146728516, "learning_rate": 0.0002, "epoch": 4.280911468061262, "step": 5730}, {"loss": 1.3895, "grad_norm": 0.7222902178764343, "learning_rate": 0.0002, "epoch": 4.288382517743743, "step": 5740}, {"loss": 1.4155, "grad_norm": 0.8508469462394714, "learning_rate": 0.0002, "epoch": 4.2958535674262235, "step": 5750}, {"loss": 1.365, "grad_norm": 0.7215430736541748, "learning_rate": 0.0002, "epoch": 4.303324617108704, "step": 5760}, {"loss": 1.4472, "grad_norm": 0.8774884939193726, "learning_rate": 0.0002, "epoch": 4.310795666791184, "step": 5770}, {"loss": 1.427, "grad_norm": 0.8354552984237671, "learning_rate": 0.0002, "epoch": 4.318266716473665, "step": 5780}, {"loss": 1.3222, "grad_norm": 0.6938814520835876, "learning_rate": 0.0002, "epoch": 4.325737766156145, "step": 5790}, {"loss": 1.3589, "grad_norm": 0.78675377368927, "learning_rate": 0.0002, "epoch": 4.333208815838625, "step": 5800}, {"loss": 1.3662, "grad_norm": 0.7147697806358337, "learning_rate": 0.0002, "epoch": 4.340679865521106, "step": 5810}, {"loss": 1.3597, "grad_norm": 0.7693623304367065, "learning_rate": 0.0002, "epoch": 4.348150915203586, "step": 5820}, {"loss": 1.2944, "grad_norm": 0.856517493724823, "learning_rate": 0.0002, "epoch": 4.355621964886066, "step": 5830}, {"loss": 1.4307, "grad_norm": 0.7200973033905029, "learning_rate": 0.0002, "epoch": 4.3630930145685465, "step": 5840}, {"loss": 1.442, "grad_norm": 0.743281364440918, "learning_rate": 0.0002, "epoch": 4.3705640642510275, "step": 5850}, {"loss": 1.3999, "grad_norm": 0.7627727389335632, "learning_rate": 0.0002, "epoch": 4.378035113933508, "step": 5860}, {"loss": 1.4082, "grad_norm": 0.7238836884498596, "learning_rate": 0.0002, "epoch": 4.385506163615988, "step": 5870}, {"loss": 1.4292, "grad_norm": 0.7253410816192627, "learning_rate": 0.0002, "epoch": 4.392977213298469, "step": 5880}, {"loss": 1.3774, "grad_norm": 0.8232238292694092, "learning_rate": 0.0002, "epoch": 4.400448262980949, "step": 5890}, {"loss": 1.3757, "grad_norm": 0.8778504729270935, "learning_rate": 0.0002, "epoch": 4.407919312663429, "step": 5900}, {"loss": 1.387, "grad_norm": 0.7639474868774414, "learning_rate": 0.0002, "epoch": 4.415390362345909, "step": 5910}, {"loss": 1.3862, "grad_norm": 0.7666519284248352, "learning_rate": 0.0002, "epoch": 4.42286141202839, "step": 5920}, {"loss": 1.4168, "grad_norm": 0.867132842540741, "learning_rate": 0.0002, "epoch": 4.43033246171087, "step": 5930}, {"loss": 1.4772, "grad_norm": 0.7571166753768921, "learning_rate": 0.0002, "epoch": 4.4378035113933505, "step": 5940}, {"loss": 1.4401, "grad_norm": 0.7911370992660522, "learning_rate": 0.0002, "epoch": 4.4452745610758315, "step": 5950}, {"loss": 1.4516, "grad_norm": 0.8844250440597534, "learning_rate": 0.0002, "epoch": 4.452745610758312, "step": 5960}, {"loss": 1.4109, "grad_norm": 0.7336231470108032, "learning_rate": 0.0002, "epoch": 4.460216660440792, "step": 5970}, {"loss": 1.3891, "grad_norm": 0.8162738084793091, "learning_rate": 0.0002, "epoch": 4.467687710123272, "step": 5980}, {"loss": 1.393, "grad_norm": 0.7413017153739929, "learning_rate": 0.0002, "epoch": 4.475158759805753, "step": 5990}, {"loss": 1.3712, "grad_norm": 0.7215432524681091, "learning_rate": 0.0002, "epoch": 4.482629809488233, "step": 6000}, {"loss": 1.3521, "grad_norm": 0.8943389058113098, "learning_rate": 0.0002, "epoch": 4.490100859170713, "step": 6010}, {"loss": 1.4172, "grad_norm": 0.7850823998451233, "learning_rate": 0.0002, "epoch": 4.497571908853194, "step": 6020}, {"loss": 1.3582, "grad_norm": 0.8117504119873047, "learning_rate": 0.0002, "epoch": 4.505042958535674, "step": 6030}, {"loss": 1.4272, "grad_norm": 0.8381605744361877, "learning_rate": 0.0002, "epoch": 4.5125140082181545, "step": 6040}, {"loss": 1.3829, "grad_norm": 0.7964059710502625, "learning_rate": 0.0002, "epoch": 4.519985057900635, "step": 6050}, {"loss": 1.3555, "grad_norm": 0.7935128211975098, "learning_rate": 0.0002, "epoch": 4.527456107583116, "step": 6060}, {"loss": 1.3994, "grad_norm": 0.8725124597549438, "learning_rate": 0.0002, "epoch": 4.534927157265596, "step": 6070}, {"loss": 1.3923, "grad_norm": 0.880325198173523, "learning_rate": 0.0002, "epoch": 4.542398206948076, "step": 6080}, {"loss": 1.4459, "grad_norm": 0.7220637202262878, "learning_rate": 0.0002, "epoch": 4.549869256630557, "step": 6090}, {"loss": 1.3281, "grad_norm": 0.6908547878265381, "learning_rate": 0.0002, "epoch": 4.557340306313037, "step": 6100}, {"loss": 1.437, "grad_norm": 0.797931969165802, "learning_rate": 0.0002, "epoch": 4.564811355995517, "step": 6110}, {"loss": 1.4023, "grad_norm": 0.7056134343147278, "learning_rate": 0.0002, "epoch": 4.572282405677997, "step": 6120}, {"loss": 1.3814, "grad_norm": 0.7850478887557983, "learning_rate": 0.0002, "epoch": 4.579753455360478, "step": 6130}, {"loss": 1.3579, "grad_norm": 0.8112621307373047, "learning_rate": 0.0002, "epoch": 4.5872245050429585, "step": 6140}, {"loss": 1.3523, "grad_norm": 0.7040849328041077, "learning_rate": 0.0002, "epoch": 4.594695554725439, "step": 6150}, {"loss": 1.3526, "grad_norm": 0.7214553952217102, "learning_rate": 0.0002, "epoch": 4.60216660440792, "step": 6160}, {"loss": 1.3932, "grad_norm": 0.8616511821746826, "learning_rate": 0.0002, "epoch": 4.6096376540904, "step": 6170}, {"loss": 1.4622, "grad_norm": 0.8374658226966858, "learning_rate": 0.0002, "epoch": 4.61710870377288, "step": 6180}, {"loss": 1.3703, "grad_norm": 0.6761606931686401, "learning_rate": 0.0002, "epoch": 4.62457975345536, "step": 6190}, {"loss": 1.3977, "grad_norm": 0.768028199672699, "learning_rate": 0.0002, "epoch": 4.632050803137841, "step": 6200}, {"loss": 1.3772, "grad_norm": 0.9372717142105103, "learning_rate": 0.0002, "epoch": 4.639521852820321, "step": 6210}, {"loss": 1.4098, "grad_norm": 0.7906546592712402, "learning_rate": 0.0002, "epoch": 4.646992902502801, "step": 6220}, {"loss": 1.3962, "grad_norm": 0.7376723289489746, "learning_rate": 0.0002, "epoch": 4.654463952185282, "step": 6230}, {"loss": 1.4529, "grad_norm": 0.8972630500793457, "learning_rate": 0.0002, "epoch": 4.6619350018677626, "step": 6240}, {"loss": 1.4668, "grad_norm": 0.8261756300926208, "learning_rate": 0.0002, "epoch": 4.669406051550243, "step": 6250}, {"loss": 1.3267, "grad_norm": 0.7512393593788147, "learning_rate": 0.0002, "epoch": 4.676877101232723, "step": 6260}, {"loss": 1.4278, "grad_norm": 0.7132362127304077, "learning_rate": 0.0002, "epoch": 4.684348150915204, "step": 6270}, {"loss": 1.4299, "grad_norm": 0.7690575122833252, "learning_rate": 0.0002, "epoch": 4.691819200597684, "step": 6280}, {"loss": 1.4769, "grad_norm": 0.9886258840560913, "learning_rate": 0.0002, "epoch": 4.699290250280164, "step": 6290}, {"loss": 1.4005, "grad_norm": 0.9502435922622681, "learning_rate": 0.0002, "epoch": 4.706761299962645, "step": 6300}, {"loss": 1.4319, "grad_norm": 0.702255129814148, "learning_rate": 0.0002, "epoch": 4.714232349645125, "step": 6310}, {"loss": 1.4447, "grad_norm": 0.7713103890419006, "learning_rate": 0.0002, "epoch": 4.721703399327605, "step": 6320}, {"loss": 1.4392, "grad_norm": 0.7778580784797668, "learning_rate": 0.0002, "epoch": 4.7291744490100855, "step": 6330}, {"loss": 1.4169, "grad_norm": 0.7275111079216003, "learning_rate": 0.0002, "epoch": 4.736645498692567, "step": 6340}, {"loss": 1.4429, "grad_norm": 0.7728744149208069, "learning_rate": 0.0002, "epoch": 4.744116548375047, "step": 6350}, {"loss": 1.3756, "grad_norm": 0.9724260568618774, "learning_rate": 0.0002, "epoch": 4.751587598057527, "step": 6360}, {"loss": 1.3358, "grad_norm": 0.7505622506141663, "learning_rate": 0.0002, "epoch": 4.759058647740007, "step": 6370}, {"loss": 1.379, "grad_norm": 0.7994682788848877, "learning_rate": 0.0002, "epoch": 4.766529697422488, "step": 6380}, {"loss": 1.4275, "grad_norm": 0.8432038426399231, "learning_rate": 0.0002, "epoch": 4.774000747104968, "step": 6390}, {"loss": 1.4606, "grad_norm": 0.7436022758483887, "learning_rate": 0.0002, "epoch": 4.781471796787448, "step": 6400}, {"loss": 1.3461, "grad_norm": 0.7709194421768188, "learning_rate": 0.0002, "epoch": 4.788942846469929, "step": 6410}, {"loss": 1.3715, "grad_norm": 0.8798436522483826, "learning_rate": 0.0002, "epoch": 4.796413896152409, "step": 6420}, {"loss": 1.3761, "grad_norm": 0.790189266204834, "learning_rate": 0.0002, "epoch": 4.80388494583489, "step": 6430}, {"loss": 1.4109, "grad_norm": 0.6824303865432739, "learning_rate": 0.0002, "epoch": 4.811355995517371, "step": 6440}, {"loss": 1.3877, "grad_norm": 0.7501044869422913, "learning_rate": 0.0002, "epoch": 4.818827045199851, "step": 6450}, {"loss": 1.4458, "grad_norm": 0.8840398192405701, "learning_rate": 0.0002, "epoch": 4.826298094882331, "step": 6460}, {"loss": 1.4412, "grad_norm": 0.7812688946723938, "learning_rate": 0.0002, "epoch": 4.833769144564811, "step": 6470}, {"loss": 1.4299, "grad_norm": 0.7429926991462708, "learning_rate": 0.0002, "epoch": 4.841240194247292, "step": 6480}, {"loss": 1.5062, "grad_norm": 0.7778021693229675, "learning_rate": 0.0002, "epoch": 4.848711243929772, "step": 6490}, {"loss": 1.4589, "grad_norm": 0.8270702362060547, "learning_rate": 0.0002, "epoch": 4.856182293612252, "step": 6500}, {"loss": 1.4091, "grad_norm": 0.6960513591766357, "learning_rate": 0.0002, "epoch": 4.863653343294732, "step": 6510}, {"loss": 1.376, "grad_norm": 0.7728942632675171, "learning_rate": 0.0002, "epoch": 4.8711243929772134, "step": 6520}, {"loss": 1.4852, "grad_norm": 0.7377303838729858, "learning_rate": 0.0002, "epoch": 4.878595442659694, "step": 6530}, {"loss": 1.3846, "grad_norm": 0.7257253527641296, "learning_rate": 0.0002, "epoch": 4.886066492342174, "step": 6540}, {"loss": 1.4166, "grad_norm": 0.7875821590423584, "learning_rate": 0.0002, "epoch": 4.893537542024655, "step": 6550}, {"loss": 1.357, "grad_norm": 0.8346304297447205, "learning_rate": 0.0002, "epoch": 4.901008591707135, "step": 6560}, {"loss": 1.4522, "grad_norm": 0.7710739374160767, "learning_rate": 0.0002, "epoch": 4.908479641389615, "step": 6570}, {"loss": 1.4465, "grad_norm": 0.7015138268470764, "learning_rate": 0.0002, "epoch": 4.915950691072096, "step": 6580}, {"loss": 1.435, "grad_norm": 0.8707432150840759, "learning_rate": 0.0002, "epoch": 4.923421740754576, "step": 6590}, {"loss": 1.2968, "grad_norm": 0.786601185798645, "learning_rate": 0.0002, "epoch": 4.930892790437056, "step": 6600}, {"loss": 1.4385, "grad_norm": 0.978519082069397, "learning_rate": 0.0002, "epoch": 4.938363840119536, "step": 6610}, {"loss": 1.3997, "grad_norm": 0.8102927207946777, "learning_rate": 0.0002, "epoch": 4.9458348898020175, "step": 6620}, {"loss": 1.4859, "grad_norm": 0.7628704309463501, "learning_rate": 0.0002, "epoch": 4.953305939484498, "step": 6630}, {"loss": 1.3774, "grad_norm": 0.8053455352783203, "learning_rate": 0.0002, "epoch": 4.960776989166978, "step": 6640}, {"loss": 1.5092, "grad_norm": 0.8680412173271179, "learning_rate": 0.0002, "epoch": 4.968248038849458, "step": 6650}, {"loss": 1.3978, "grad_norm": 0.7415758371353149, "learning_rate": 0.0002, "epoch": 4.975719088531939, "step": 6660}, {"loss": 1.3793, "grad_norm": 0.7730312347412109, "learning_rate": 0.0002, "epoch": 4.983190138214419, "step": 6670}, {"loss": 1.4863, "grad_norm": 0.7924041152000427, "learning_rate": 0.0002, "epoch": 4.990661187896899, "step": 6680}, {"loss": 1.4137, "grad_norm": 0.8677893877029419, "learning_rate": 0.0002, "epoch": 4.99813223757938, "step": 6690}, {"eval_loss": 1.9444633722305298, "eval_runtime": 39.3488, "eval_samples_per_second": 13.088, "eval_steps_per_second": 1.652, "epoch": 4.999626447515876, "step": 6692}, {"loss": 1.3076, "grad_norm": 0.7102245092391968, "learning_rate": 0.0002, "epoch": 5.00560328726186, "step": 6700}, {"loss": 1.2714, "grad_norm": 1.0425463914871216, "learning_rate": 0.0002, "epoch": 5.0130743369443405, "step": 6710}, {"loss": 1.181, "grad_norm": 0.9320756793022156, "learning_rate": 0.0002, "epoch": 5.0205453866268215, "step": 6720}, {"loss": 1.1786, "grad_norm": 0.8797217607498169, "learning_rate": 0.0002, "epoch": 5.028016436309302, "step": 6730}, {"loss": 1.2097, "grad_norm": 2.135707139968872, "learning_rate": 0.0002, "epoch": 5.035487485991782, "step": 6740}, {"loss": 1.1761, "grad_norm": 0.8747734427452087, "learning_rate": 0.0002, "epoch": 5.042958535674262, "step": 6750}, {"loss": 1.1675, "grad_norm": 0.9981076717376709, "learning_rate": 0.0002, "epoch": 5.050429585356743, "step": 6760}, {"loss": 1.1976, "grad_norm": 0.985078752040863, "learning_rate": 0.0002, "epoch": 5.057900635039223, "step": 6770}, {"loss": 1.2688, "grad_norm": 1.0974019765853882, "learning_rate": 0.0002, "epoch": 5.065371684721703, "step": 6780}, {"loss": 1.1982, "grad_norm": 0.9823219180107117, "learning_rate": 0.0002, "epoch": 5.072842734404184, "step": 6790}, {"loss": 1.2586, "grad_norm": 1.122605562210083, "learning_rate": 0.0002, "epoch": 5.080313784086664, "step": 6800}, {"loss": 1.2069, "grad_norm": 0.8556802272796631, "learning_rate": 0.0002, "epoch": 5.0877848337691445, "step": 6810}, {"loss": 1.1908, "grad_norm": 1.1699262857437134, "learning_rate": 0.0002, "epoch": 5.095255883451625, "step": 6820}, {"loss": 1.1869, "grad_norm": 1.0440590381622314, "learning_rate": 0.0002, "epoch": 5.102726933134106, "step": 6830}, {"loss": 1.1655, "grad_norm": 1.0445300340652466, "learning_rate": 0.0002, "epoch": 5.110197982816586, "step": 6840}, {"loss": 1.2392, "grad_norm": 0.8289563059806824, "learning_rate": 0.0002, "epoch": 5.117669032499066, "step": 6850}, {"loss": 1.1687, "grad_norm": 1.1051193475723267, "learning_rate": 0.0002, "epoch": 5.125140082181547, "step": 6860}, {"loss": 1.2737, "grad_norm": 0.9345614910125732, "learning_rate": 0.0002, "epoch": 5.132611131864027, "step": 6870}, {"loss": 1.3021, "grad_norm": 1.1222996711730957, "learning_rate": 0.0002, "epoch": 5.140082181546507, "step": 6880}, {"loss": 1.2408, "grad_norm": 0.9405338764190674, "learning_rate": 0.0002, "epoch": 5.147553231228987, "step": 6890}, {"loss": 1.2367, "grad_norm": 1.0935171842575073, "learning_rate": 0.0002, "epoch": 5.155024280911468, "step": 6900}, {"loss": 1.2458, "grad_norm": 1.0438612699508667, "learning_rate": 0.0002, "epoch": 5.1624953305939485, "step": 6910}, {"loss": 1.2562, "grad_norm": 1.1189004182815552, "learning_rate": 0.0002, "epoch": 5.169966380276429, "step": 6920}, {"loss": 1.25, "grad_norm": 1.0533215999603271, "learning_rate": 0.0002, "epoch": 5.17743742995891, "step": 6930}, {"loss": 1.2974, "grad_norm": 0.9779648780822754, "learning_rate": 0.0002, "epoch": 5.18490847964139, "step": 6940}, {"loss": 1.1965, "grad_norm": 0.8920868635177612, "learning_rate": 0.0002, "epoch": 5.19237952932387, "step": 6950}, {"loss": 1.283, "grad_norm": 0.8374548554420471, "learning_rate": 0.0002, "epoch": 5.19985057900635, "step": 6960}, {"loss": 1.2775, "grad_norm": 1.0490682125091553, "learning_rate": 0.0002, "epoch": 5.207321628688831, "step": 6970}, {"loss": 1.1826, "grad_norm": 0.9658287167549133, "learning_rate": 0.0002, "epoch": 5.214792678371311, "step": 6980}, {"loss": 1.2647, "grad_norm": 0.9652056097984314, "learning_rate": 0.0002, "epoch": 5.222263728053791, "step": 6990}, {"loss": 1.3023, "grad_norm": 0.9141794443130493, "learning_rate": 0.0002, "epoch": 5.229734777736272, "step": 7000}, {"loss": 1.2456, "grad_norm": 0.9831376671791077, "learning_rate": 0.0002, "epoch": 5.2372058274187525, "step": 7010}, {"loss": 1.2176, "grad_norm": 1.0198718309402466, "learning_rate": 0.0002, "epoch": 5.244676877101233, "step": 7020}, {"loss": 1.2643, "grad_norm": 0.9647888541221619, "learning_rate": 0.0002, "epoch": 5.252147926783713, "step": 7030}, {"loss": 1.2106, "grad_norm": 1.3941649198532104, "learning_rate": 0.0002, "epoch": 5.259618976466194, "step": 7040}, {"loss": 1.2885, "grad_norm": 1.0305466651916504, "learning_rate": 0.0002, "epoch": 5.267090026148674, "step": 7050}, {"loss": 1.2362, "grad_norm": 0.9577859044075012, "learning_rate": 0.0002, "epoch": 5.274561075831154, "step": 7060}, {"loss": 1.2231, "grad_norm": 1.149092197418213, "learning_rate": 0.0002, "epoch": 5.282032125513634, "step": 7070}, {"loss": 1.2986, "grad_norm": 1.2582733631134033, "learning_rate": 0.0002, "epoch": 5.289503175196115, "step": 7080}, {"loss": 1.2307, "grad_norm": 1.1777442693710327, "learning_rate": 0.0002, "epoch": 5.296974224878595, "step": 7090}, {"loss": 1.24, "grad_norm": 1.0076404809951782, "learning_rate": 0.0002, "epoch": 5.3044452745610755, "step": 7100}, {"loss": 1.1407, "grad_norm": 0.9037365913391113, "learning_rate": 0.0002, "epoch": 5.3119163242435565, "step": 7110}, {"loss": 1.238, "grad_norm": 0.9428724646568298, "learning_rate": 0.0002, "epoch": 5.319387373926037, "step": 7120}, {"loss": 1.2571, "grad_norm": 0.9935154318809509, "learning_rate": 0.0002, "epoch": 5.326858423608517, "step": 7130}, {"loss": 1.2833, "grad_norm": 1.087500810623169, "learning_rate": 0.0002, "epoch": 5.334329473290998, "step": 7140}, {"loss": 1.2304, "grad_norm": 0.8543072938919067, "learning_rate": 0.0002, "epoch": 5.341800522973478, "step": 7150}, {"loss": 1.2755, "grad_norm": 0.9323700070381165, "learning_rate": 0.0002, "epoch": 5.349271572655958, "step": 7160}, {"loss": 1.2769, "grad_norm": 1.0037827491760254, "learning_rate": 0.0002, "epoch": 5.356742622338438, "step": 7170}, {"loss": 1.3204, "grad_norm": 0.8746469616889954, "learning_rate": 0.0002, "epoch": 5.364213672020919, "step": 7180}, {"loss": 1.2759, "grad_norm": 0.9516328573226929, "learning_rate": 0.0002, "epoch": 5.371684721703399, "step": 7190}, {"loss": 1.2428, "grad_norm": 0.9395177364349365, "learning_rate": 0.0002, "epoch": 5.3791557713858795, "step": 7200}, {"loss": 1.3214, "grad_norm": 1.000369906425476, "learning_rate": 0.0002, "epoch": 5.38662682106836, "step": 7210}, {"loss": 1.2337, "grad_norm": 1.0845502614974976, "learning_rate": 0.0002, "epoch": 5.394097870750841, "step": 7220}, {"loss": 1.2776, "grad_norm": 0.8975145220756531, "learning_rate": 0.0002, "epoch": 5.401568920433321, "step": 7230}, {"loss": 1.2306, "grad_norm": 1.040077805519104, "learning_rate": 0.0002, "epoch": 5.409039970115801, "step": 7240}, {"loss": 1.2277, "grad_norm": 1.0729942321777344, "learning_rate": 0.0002, "epoch": 5.416511019798282, "step": 7250}, {"loss": 1.2714, "grad_norm": 0.8322232961654663, "learning_rate": 0.0002, "epoch": 5.423982069480762, "step": 7260}, {"loss": 1.3036, "grad_norm": 1.0654641389846802, "learning_rate": 0.0002, "epoch": 5.431453119163242, "step": 7270}, {"loss": 1.268, "grad_norm": 1.0445852279663086, "learning_rate": 0.0002, "epoch": 5.438924168845723, "step": 7280}, {"loss": 1.2743, "grad_norm": 1.0762956142425537, "learning_rate": 0.0002, "epoch": 5.446395218528203, "step": 7290}, {"loss": 1.2887, "grad_norm": 0.9721953868865967, "learning_rate": 0.0002, "epoch": 5.4538662682106835, "step": 7300}, {"loss": 1.2833, "grad_norm": 0.9238539338111877, "learning_rate": 0.0002, "epoch": 5.461337317893164, "step": 7310}, {"loss": 1.255, "grad_norm": 0.9912874102592468, "learning_rate": 0.0002, "epoch": 5.468808367575645, "step": 7320}, {"loss": 1.2557, "grad_norm": 1.0727077722549438, "learning_rate": 0.0002, "epoch": 5.476279417258125, "step": 7330}, {"loss": 1.3471, "grad_norm": 0.8633865118026733, "learning_rate": 0.0002, "epoch": 5.483750466940605, "step": 7340}, {"loss": 1.3155, "grad_norm": 0.9396262764930725, "learning_rate": 0.0002, "epoch": 5.491221516623085, "step": 7350}, {"loss": 1.3146, "grad_norm": 1.0253715515136719, "learning_rate": 0.0002, "epoch": 5.498692566305566, "step": 7360}, {"loss": 1.3156, "grad_norm": 1.006047010421753, "learning_rate": 0.0002, "epoch": 5.506163615988046, "step": 7370}, {"loss": 1.3107, "grad_norm": 0.9781233072280884, "learning_rate": 0.0002, "epoch": 5.513634665670526, "step": 7380}, {"loss": 1.2703, "grad_norm": 0.9945126175880432, "learning_rate": 0.0002, "epoch": 5.521105715353007, "step": 7390}, {"loss": 1.1936, "grad_norm": 0.9081175327301025, "learning_rate": 0.0002, "epoch": 5.528576765035488, "step": 7400}, {"loss": 1.2651, "grad_norm": 1.2215938568115234, "learning_rate": 0.0002, "epoch": 5.536047814717968, "step": 7410}, {"loss": 1.2484, "grad_norm": 1.0724077224731445, "learning_rate": 0.0002, "epoch": 5.543518864400449, "step": 7420}, {"loss": 1.3083, "grad_norm": 1.106955885887146, "learning_rate": 0.0002, "epoch": 5.550989914082929, "step": 7430}, {"loss": 1.2125, "grad_norm": 1.0657650232315063, "learning_rate": 0.0002, "epoch": 5.558460963765409, "step": 7440}, {"loss": 1.2576, "grad_norm": 0.9725455641746521, "learning_rate": 0.0002, "epoch": 5.565932013447889, "step": 7450}, {"loss": 1.3297, "grad_norm": 0.8604224324226379, "learning_rate": 0.0002, "epoch": 5.57340306313037, "step": 7460}, {"loss": 1.3084, "grad_norm": 0.9913371205329895, "learning_rate": 0.0002, "epoch": 5.58087411281285, "step": 7470}, {"loss": 1.3371, "grad_norm": 1.012073040008545, "learning_rate": 0.0002, "epoch": 5.58834516249533, "step": 7480}, {"loss": 1.2526, "grad_norm": 1.1003159284591675, "learning_rate": 0.0002, "epoch": 5.5958162121778106, "step": 7490}, {"loss": 1.2577, "grad_norm": 0.9104593992233276, "learning_rate": 0.0002, "epoch": 5.603287261860292, "step": 7500}, {"loss": 1.2578, "grad_norm": 0.9480831623077393, "learning_rate": 0.0002, "epoch": 5.610758311542772, "step": 7510}, {"loss": 1.3056, "grad_norm": 1.0826456546783447, "learning_rate": 0.0002, "epoch": 5.618229361225252, "step": 7520}, {"loss": 1.2931, "grad_norm": 0.8286259174346924, "learning_rate": 0.0002, "epoch": 5.625700410907733, "step": 7530}, {"loss": 1.2918, "grad_norm": 0.9145061373710632, "learning_rate": 0.0002, "epoch": 5.633171460590213, "step": 7540}, {"loss": 1.1736, "grad_norm": 0.9363601803779602, "learning_rate": 0.0002, "epoch": 5.640642510272693, "step": 7550}, {"loss": 1.2265, "grad_norm": 0.9553244709968567, "learning_rate": 0.0002, "epoch": 5.648113559955174, "step": 7560}, {"loss": 1.2356, "grad_norm": 1.0343557596206665, "learning_rate": 0.0002, "epoch": 5.655584609637654, "step": 7570}, {"loss": 1.3171, "grad_norm": 0.8734238743782043, "learning_rate": 0.0002, "epoch": 5.663055659320134, "step": 7580}, {"loss": 1.2785, "grad_norm": 1.0230586528778076, "learning_rate": 0.0002, "epoch": 5.670526709002615, "step": 7590}, {"loss": 1.2936, "grad_norm": 1.0063409805297852, "learning_rate": 0.0002, "epoch": 5.677997758685096, "step": 7600}, {"loss": 1.2396, "grad_norm": 1.0104626417160034, "learning_rate": 0.0002, "epoch": 5.685468808367576, "step": 7610}, {"loss": 1.2581, "grad_norm": 0.9528168439865112, "learning_rate": 0.0002, "epoch": 5.692939858050056, "step": 7620}, {"loss": 1.3116, "grad_norm": 0.9799878597259521, "learning_rate": 0.0002, "epoch": 5.700410907732536, "step": 7630}, {"loss": 1.2632, "grad_norm": 0.969351589679718, "learning_rate": 0.0002, "epoch": 5.707881957415017, "step": 7640}, {"loss": 1.3055, "grad_norm": 1.3037652969360352, "learning_rate": 0.0002, "epoch": 5.715353007097497, "step": 7650}, {"loss": 1.3126, "grad_norm": 1.0640486478805542, "learning_rate": 0.0002, "epoch": 5.722824056779977, "step": 7660}, {"loss": 1.3325, "grad_norm": 1.0416420698165894, "learning_rate": 0.0002, "epoch": 5.730295106462458, "step": 7670}, {"loss": 1.25, "grad_norm": 0.8893619775772095, "learning_rate": 0.0002, "epoch": 5.7377661561449385, "step": 7680}, {"loss": 1.319, "grad_norm": 0.8512844443321228, "learning_rate": 0.0002, "epoch": 5.745237205827419, "step": 7690}, {"loss": 1.3328, "grad_norm": 0.9955748319625854, "learning_rate": 0.0002, "epoch": 5.7527082555099, "step": 7700}, {"loss": 1.294, "grad_norm": 1.0409910678863525, "learning_rate": 0.0002, "epoch": 5.76017930519238, "step": 7710}, {"loss": 1.3518, "grad_norm": 1.010097861289978, "learning_rate": 0.0002, "epoch": 5.76765035487486, "step": 7720}, {"loss": 1.2106, "grad_norm": 0.8974892497062683, "learning_rate": 0.0002, "epoch": 5.77512140455734, "step": 7730}, {"loss": 1.2743, "grad_norm": 0.972835123538971, "learning_rate": 0.0002, "epoch": 5.782592454239821, "step": 7740}, {"loss": 1.3549, "grad_norm": 0.9607440829277039, "learning_rate": 0.0002, "epoch": 5.790063503922301, "step": 7750}, {"loss": 1.29, "grad_norm": 0.9426500797271729, "learning_rate": 0.0002, "epoch": 5.797534553604781, "step": 7760}, {"loss": 1.274, "grad_norm": 0.8745320439338684, "learning_rate": 0.0002, "epoch": 5.8050056032872615, "step": 7770}, {"loss": 1.3009, "grad_norm": 1.0117204189300537, "learning_rate": 0.0002, "epoch": 5.8124766529697425, "step": 7780}, {"loss": 1.3135, "grad_norm": 1.0387755632400513, "learning_rate": 0.0002, "epoch": 5.819947702652223, "step": 7790}, {"loss": 1.2709, "grad_norm": 1.0709784030914307, "learning_rate": 0.0002, "epoch": 5.827418752334703, "step": 7800}, {"loss": 1.225, "grad_norm": 0.9512667655944824, "learning_rate": 0.0002, "epoch": 5.834889802017184, "step": 7810}, {"loss": 1.3284, "grad_norm": 1.021094560623169, "learning_rate": 0.0002, "epoch": 5.842360851699664, "step": 7820}, {"loss": 1.2794, "grad_norm": 1.117491364479065, "learning_rate": 0.0002, "epoch": 5.849831901382144, "step": 7830}, {"loss": 1.3646, "grad_norm": 0.9252554178237915, "learning_rate": 0.0002, "epoch": 5.857302951064625, "step": 7840}, {"loss": 1.2976, "grad_norm": 1.1416207551956177, "learning_rate": 0.0002, "epoch": 5.864774000747105, "step": 7850}, {"loss": 1.3293, "grad_norm": 1.1219907999038696, "learning_rate": 0.0002, "epoch": 5.872245050429585, "step": 7860}, {"loss": 1.2334, "grad_norm": 0.8300467729568481, "learning_rate": 0.0002, "epoch": 5.8797161001120655, "step": 7870}, {"loss": 1.3132, "grad_norm": 1.00551438331604, "learning_rate": 0.0002, "epoch": 5.8871871497945465, "step": 7880}, {"loss": 1.2609, "grad_norm": 0.8981153964996338, "learning_rate": 0.0002, "epoch": 5.894658199477027, "step": 7890}, {"loss": 1.2817, "grad_norm": 1.0247976779937744, "learning_rate": 0.0002, "epoch": 5.902129249159507, "step": 7900}, {"loss": 1.2866, "grad_norm": 1.0820319652557373, "learning_rate": 0.0002, "epoch": 5.909600298841987, "step": 7910}, {"loss": 1.2941, "grad_norm": 0.952675461769104, "learning_rate": 0.0002, "epoch": 5.917071348524468, "step": 7920}, {"loss": 1.307, "grad_norm": 0.8666740655899048, "learning_rate": 0.0002, "epoch": 5.924542398206948, "step": 7930}, {"loss": 1.2752, "grad_norm": 0.8640421032905579, "learning_rate": 0.0002, "epoch": 5.932013447889428, "step": 7940}, {"loss": 1.2386, "grad_norm": 1.2343276739120483, "learning_rate": 0.0002, "epoch": 5.939484497571909, "step": 7950}, {"loss": 1.2333, "grad_norm": 0.958046555519104, "learning_rate": 0.0002, "epoch": 5.946955547254389, "step": 7960}, {"loss": 1.2352, "grad_norm": 1.0538510084152222, "learning_rate": 0.0002, "epoch": 5.9544265969368695, "step": 7970}, {"loss": 1.3233, "grad_norm": 1.2681571245193481, "learning_rate": 0.0002, "epoch": 5.9618976466193505, "step": 7980}, {"loss": 1.2514, "grad_norm": 0.8171183466911316, "learning_rate": 0.0002, "epoch": 5.969368696301831, "step": 7990}, {"loss": 1.3412, "grad_norm": 0.9109523892402649, "learning_rate": 0.0002, "epoch": 5.976839745984311, "step": 8000}, {"loss": 1.3497, "grad_norm": 1.0040639638900757, "learning_rate": 0.0002, "epoch": 5.984310795666791, "step": 8010}, {"loss": 1.3299, "grad_norm": 0.9596554040908813, "learning_rate": 0.0002, "epoch": 5.991781845349272, "step": 8020}, {"loss": 1.3109, "grad_norm": 0.9782963991165161, "learning_rate": 0.0002, "epoch": 5.999252895031752, "step": 8030}, {"eval_loss": 2.0417845249176025, "eval_runtime": 38.8465, "eval_samples_per_second": 13.257, "eval_steps_per_second": 1.673, "epoch": 6.0, "step": 8031}, {"loss": 1.0886, "grad_norm": 1.380823016166687, "learning_rate": 0.0002, "epoch": 6.006723944714232, "step": 8040}, {"loss": 1.0413, "grad_norm": 1.067636251449585, "learning_rate": 0.0002, "epoch": 6.014194994396712, "step": 8050}, {"loss": 1.0686, "grad_norm": 1.363402009010315, "learning_rate": 0.0002, "epoch": 6.021666044079193, "step": 8060}, {"loss": 1.0762, "grad_norm": 0.9901054501533508, "learning_rate": 0.0002, "epoch": 6.0291370937616735, "step": 8070}, {"loss": 1.1182, "grad_norm": 1.1545379161834717, "learning_rate": 0.0002, "epoch": 6.036608143444154, "step": 8080}, {"loss": 1.0644, "grad_norm": 1.2259265184402466, "learning_rate": 0.0002, "epoch": 6.044079193126635, "step": 8090}, {"loss": 1.1273, "grad_norm": 1.1237425804138184, "learning_rate": 0.0002, "epoch": 6.051550242809115, "step": 8100}, {"loss": 1.1001, "grad_norm": 1.2805622816085815, "learning_rate": 0.0002, "epoch": 6.059021292491595, "step": 8110}, {"loss": 1.0731, "grad_norm": 1.2270452976226807, "learning_rate": 0.0002, "epoch": 6.066492342174075, "step": 8120}, {"loss": 1.0692, "grad_norm": 1.1924101114273071, "learning_rate": 0.0002, "epoch": 6.073963391856556, "step": 8130}, {"loss": 1.1698, "grad_norm": 1.2543894052505493, "learning_rate": 0.0002, "epoch": 6.081434441539036, "step": 8140}, {"loss": 1.069, "grad_norm": 1.1821149587631226, "learning_rate": 0.0002, "epoch": 6.088905491221516, "step": 8150}, {"loss": 1.109, "grad_norm": 1.2202836275100708, "learning_rate": 0.0002, "epoch": 6.096376540903997, "step": 8160}, {"loss": 1.136, "grad_norm": 1.0576019287109375, "learning_rate": 0.0002, "epoch": 6.1038475905864775, "step": 8170}, {"loss": 1.1395, "grad_norm": 1.31708824634552, "learning_rate": 0.0002, "epoch": 6.111318640268958, "step": 8180}, {"loss": 1.0887, "grad_norm": 1.0479495525360107, "learning_rate": 0.0002, "epoch": 6.118789689951438, "step": 8190}, {"loss": 1.0764, "grad_norm": 1.285003423690796, "learning_rate": 0.0002, "epoch": 6.126260739633919, "step": 8200}, {"loss": 1.0642, "grad_norm": 1.0989165306091309, "learning_rate": 0.0002, "epoch": 6.133731789316399, "step": 8210}, {"loss": 1.0981, "grad_norm": 1.1659013032913208, "learning_rate": 0.0002, "epoch": 6.141202838998879, "step": 8220}, {"loss": 1.1138, "grad_norm": 1.2796376943588257, "learning_rate": 0.0002, "epoch": 6.14867388868136, "step": 8230}, {"loss": 1.1116, "grad_norm": 1.060564637184143, "learning_rate": 0.0002, "epoch": 6.15614493836384, "step": 8240}, {"loss": 1.1493, "grad_norm": 1.3884605169296265, "learning_rate": 0.0002, "epoch": 6.16361598804632, "step": 8250}, {"loss": 1.0504, "grad_norm": 1.1570569276809692, "learning_rate": 0.0002, "epoch": 6.1710870377288005, "step": 8260}, {"loss": 1.0386, "grad_norm": 1.4136502742767334, "learning_rate": 0.0002, "epoch": 6.1785580874112815, "step": 8270}, {"loss": 1.0882, "grad_norm": 1.3396095037460327, "learning_rate": 0.0002, "epoch": 6.186029137093762, "step": 8280}, {"loss": 1.133, "grad_norm": 1.2549997568130493, "learning_rate": 0.0002, "epoch": 6.193500186776242, "step": 8290}, {"loss": 1.0626, "grad_norm": 1.3629751205444336, "learning_rate": 0.0002, "epoch": 6.200971236458723, "step": 8300}, {"loss": 1.1343, "grad_norm": 1.1029163599014282, "learning_rate": 0.0002, "epoch": 6.208442286141203, "step": 8310}, {"loss": 1.0895, "grad_norm": 1.1992450952529907, "learning_rate": 0.0002, "epoch": 6.215913335823683, "step": 8320}, {"loss": 1.1417, "grad_norm": 1.3317986726760864, "learning_rate": 0.0002, "epoch": 6.223384385506163, "step": 8330}, {"loss": 1.0958, "grad_norm": 1.0538336038589478, "learning_rate": 0.0002, "epoch": 6.230855435188644, "step": 8340}, {"loss": 1.1557, "grad_norm": 1.1767704486846924, "learning_rate": 0.0002, "epoch": 6.238326484871124, "step": 8350}, {"loss": 1.1038, "grad_norm": 1.1213016510009766, "learning_rate": 0.0002, "epoch": 6.2457975345536045, "step": 8360}, {"loss": 1.1241, "grad_norm": 1.1895716190338135, "learning_rate": 0.0002, "epoch": 6.253268584236086, "step": 8370}, {"loss": 1.1171, "grad_norm": 1.1078153848648071, "learning_rate": 0.0002, "epoch": 6.260739633918566, "step": 8380}, {"loss": 1.1124, "grad_norm": 1.1662801504135132, "learning_rate": 0.0002, "epoch": 6.268210683601046, "step": 8390}, {"loss": 1.125, "grad_norm": 1.2071197032928467, "learning_rate": 0.0002, "epoch": 6.275681733283526, "step": 8400}, {"loss": 1.0625, "grad_norm": 1.2653778791427612, "learning_rate": 0.0002, "epoch": 6.283152782966007, "step": 8410}, {"loss": 1.0565, "grad_norm": 1.6128872632980347, "learning_rate": 0.0002, "epoch": 6.290623832648487, "step": 8420}, {"loss": 1.1212, "grad_norm": 1.4993070363998413, "learning_rate": 0.0002, "epoch": 6.298094882330967, "step": 8430}, {"loss": 1.1516, "grad_norm": 1.16339910030365, "learning_rate": 0.0002, "epoch": 6.305565932013448, "step": 8440}, {"loss": 1.0662, "grad_norm": 1.256822943687439, "learning_rate": 0.0002, "epoch": 6.313036981695928, "step": 8450}, {"loss": 1.1566, "grad_norm": 1.1352964639663696, "learning_rate": 0.0002, "epoch": 6.3205080313784086, "step": 8460}, {"loss": 1.1297, "grad_norm": 1.0061070919036865, "learning_rate": 0.0002, "epoch": 6.327979081060889, "step": 8470}, {"loss": 1.0967, "grad_norm": 1.1901768445968628, "learning_rate": 0.0002, "epoch": 6.33545013074337, "step": 8480}, {"loss": 1.1463, "grad_norm": 1.2715139389038086, "learning_rate": 0.0002, "epoch": 6.34292118042585, "step": 8490}, {"loss": 1.2143, "grad_norm": 1.1583346128463745, "learning_rate": 0.0002, "epoch": 6.35039223010833, "step": 8500}, {"loss": 1.1072, "grad_norm": 1.1427477598190308, "learning_rate": 0.0002, "epoch": 6.357863279790811, "step": 8510}, {"loss": 1.1119, "grad_norm": 1.1952263116836548, "learning_rate": 0.0002, "epoch": 6.365334329473291, "step": 8520}, {"loss": 1.0797, "grad_norm": 1.0599623918533325, "learning_rate": 0.0002, "epoch": 6.372805379155771, "step": 8530}, {"loss": 1.1091, "grad_norm": 1.3511574268341064, "learning_rate": 0.0002, "epoch": 6.380276428838251, "step": 8540}, {"loss": 1.1272, "grad_norm": 1.171126127243042, "learning_rate": 0.0002, "epoch": 6.387747478520732, "step": 8550}, {"loss": 1.1615, "grad_norm": 1.285474419593811, "learning_rate": 0.0002, "epoch": 6.395218528203213, "step": 8560}, {"loss": 1.1505, "grad_norm": 0.9751279950141907, "learning_rate": 0.0002, "epoch": 6.402689577885693, "step": 8570}, {"loss": 1.1502, "grad_norm": 1.2194149494171143, "learning_rate": 0.0002, "epoch": 6.410160627568174, "step": 8580}, {"loss": 1.138, "grad_norm": 1.255888819694519, "learning_rate": 0.0002, "epoch": 6.417631677250654, "step": 8590}, {"loss": 1.1308, "grad_norm": 1.1636122465133667, "learning_rate": 0.0002, "epoch": 6.425102726933134, "step": 8600}, {"loss": 1.1398, "grad_norm": 1.0769859552383423, "learning_rate": 0.0002, "epoch": 6.432573776615614, "step": 8610}, {"loss": 1.1183, "grad_norm": 1.151778221130371, "learning_rate": 0.0002, "epoch": 6.440044826298095, "step": 8620}, {"loss": 1.0706, "grad_norm": 1.2749944925308228, "learning_rate": 0.0002, "epoch": 6.447515875980575, "step": 8630}, {"loss": 1.1011, "grad_norm": 1.1925828456878662, "learning_rate": 0.0002, "epoch": 6.454986925663055, "step": 8640}, {"loss": 1.1581, "grad_norm": 1.166107416152954, "learning_rate": 0.0002, "epoch": 6.4624579753455365, "step": 8650}, {"loss": 1.105, "grad_norm": 1.0372248888015747, "learning_rate": 0.0002, "epoch": 6.469929025028017, "step": 8660}, {"loss": 1.1546, "grad_norm": 1.26933753490448, "learning_rate": 0.0002, "epoch": 6.477400074710497, "step": 8670}, {"loss": 1.2362, "grad_norm": 1.2154223918914795, "learning_rate": 0.0002, "epoch": 6.484871124392977, "step": 8680}, {"loss": 1.1096, "grad_norm": 1.09475839138031, "learning_rate": 0.0002, "epoch": 6.492342174075458, "step": 8690}, {"loss": 1.1168, "grad_norm": 1.0763037204742432, "learning_rate": 0.0002, "epoch": 6.499813223757938, "step": 8700}, {"loss": 1.1993, "grad_norm": 1.1882896423339844, "learning_rate": 0.0002, "epoch": 6.507284273440418, "step": 8710}, {"loss": 1.1498, "grad_norm": 1.1662089824676514, "learning_rate": 0.0002, "epoch": 6.514755323122898, "step": 8720}, {"loss": 1.2008, "grad_norm": 1.3259495496749878, "learning_rate": 0.0002, "epoch": 6.522226372805379, "step": 8730}, {"loss": 1.1289, "grad_norm": 1.0858017206192017, "learning_rate": 0.0002, "epoch": 6.5296974224878594, "step": 8740}, {"loss": 1.1335, "grad_norm": 1.240337610244751, "learning_rate": 0.0002, "epoch": 6.53716847217034, "step": 8750}, {"loss": 1.1479, "grad_norm": 1.1381462812423706, "learning_rate": 0.0002, "epoch": 6.544639521852821, "step": 8760}, {"loss": 1.0991, "grad_norm": 1.2220063209533691, "learning_rate": 0.0002, "epoch": 6.552110571535301, "step": 8770}, {"loss": 1.159, "grad_norm": 1.1553083658218384, "learning_rate": 0.0002, "epoch": 6.559581621217781, "step": 8780}, {"loss": 1.0996, "grad_norm": 1.1383219957351685, "learning_rate": 0.0002, "epoch": 6.567052670900262, "step": 8790}, {"loss": 1.1355, "grad_norm": 1.0379676818847656, "learning_rate": 0.0002, "epoch": 6.574523720582742, "step": 8800}, {"loss": 1.1704, "grad_norm": 1.376488447189331, "learning_rate": 0.0002, "epoch": 6.581994770265222, "step": 8810}, {"loss": 1.1265, "grad_norm": 1.1586211919784546, "learning_rate": 0.0002, "epoch": 6.589465819947702, "step": 8820}, {"loss": 1.1904, "grad_norm": 1.28152334690094, "learning_rate": 0.0002, "epoch": 6.596936869630183, "step": 8830}, {"loss": 1.1646, "grad_norm": 1.2656810283660889, "learning_rate": 0.0002, "epoch": 6.6044079193126635, "step": 8840}, {"loss": 1.1865, "grad_norm": 1.0636502504348755, "learning_rate": 0.0002, "epoch": 6.611878968995144, "step": 8850}, {"loss": 1.125, "grad_norm": 1.273239254951477, "learning_rate": 0.0002, "epoch": 6.619350018677624, "step": 8860}, {"loss": 1.1443, "grad_norm": 1.1055482625961304, "learning_rate": 0.0002, "epoch": 6.626821068360105, "step": 8870}, {"loss": 1.0877, "grad_norm": 1.1934176683425903, "learning_rate": 0.0002, "epoch": 6.634292118042585, "step": 8880}, {"loss": 1.194, "grad_norm": 1.2248114347457886, "learning_rate": 0.0002, "epoch": 6.641763167725065, "step": 8890}, {"loss": 1.1609, "grad_norm": 1.1950982809066772, "learning_rate": 0.0002, "epoch": 6.649234217407546, "step": 8900}, {"loss": 1.169, "grad_norm": 1.0821784734725952, "learning_rate": 0.0002, "epoch": 6.656705267090026, "step": 8910}, {"loss": 1.1337, "grad_norm": 1.0062463283538818, "learning_rate": 0.0002, "epoch": 6.664176316772506, "step": 8920}, {"loss": 1.1403, "grad_norm": 1.2373089790344238, "learning_rate": 0.0002, "epoch": 6.671647366454987, "step": 8930}, {"loss": 1.2051, "grad_norm": 1.1821746826171875, "learning_rate": 0.0002, "epoch": 6.6791184161374675, "step": 8940}, {"loss": 1.1214, "grad_norm": 1.2350659370422363, "learning_rate": 0.0002, "epoch": 6.686589465819948, "step": 8950}, {"loss": 1.225, "grad_norm": 1.1012883186340332, "learning_rate": 0.0002, "epoch": 6.694060515502428, "step": 8960}, {"loss": 1.2111, "grad_norm": 1.2008943557739258, "learning_rate": 0.0002, "epoch": 6.701531565184909, "step": 8970}, {"loss": 1.1769, "grad_norm": 1.2355504035949707, "learning_rate": 0.0002, "epoch": 6.709002614867389, "step": 8980}, {"loss": 1.1323, "grad_norm": 1.2367502450942993, "learning_rate": 0.0002, "epoch": 6.716473664549869, "step": 8990}, {"loss": 1.1235, "grad_norm": 1.1075866222381592, "learning_rate": 0.0002, "epoch": 6.723944714232349, "step": 9000}, {"loss": 1.1239, "grad_norm": 1.246480941772461, "learning_rate": 0.0002, "epoch": 6.73141576391483, "step": 9010}, {"loss": 1.2154, "grad_norm": 1.1252824068069458, "learning_rate": 0.0002, "epoch": 6.73888681359731, "step": 9020}, {"loss": 1.1762, "grad_norm": 1.0706887245178223, "learning_rate": 0.0002, "epoch": 6.7463578632797905, "step": 9030}, {"loss": 1.1961, "grad_norm": 1.0874755382537842, "learning_rate": 0.0002, "epoch": 6.7538289129622715, "step": 9040}, {"loss": 1.0889, "grad_norm": 1.121434211730957, "learning_rate": 0.0002, "epoch": 6.761299962644752, "step": 9050}, {"loss": 1.2018, "grad_norm": 1.1517996788024902, "learning_rate": 0.0002, "epoch": 6.768771012327232, "step": 9060}, {"loss": 1.1593, "grad_norm": 1.2484540939331055, "learning_rate": 0.0002, "epoch": 6.776242062009713, "step": 9070}, {"loss": 1.13, "grad_norm": 1.023059368133545, "learning_rate": 0.0002, "epoch": 6.783713111692193, "step": 9080}, {"loss": 1.1929, "grad_norm": 1.1334631443023682, "learning_rate": 0.0002, "epoch": 6.791184161374673, "step": 9090}, {"loss": 1.18, "grad_norm": 1.2991816997528076, "learning_rate": 0.0002, "epoch": 6.798655211057153, "step": 9100}, {"loss": 1.2398, "grad_norm": 1.4147199392318726, "learning_rate": 0.0002, "epoch": 6.806126260739634, "step": 9110}, {"loss": 1.0958, "grad_norm": 1.1353832483291626, "learning_rate": 0.0002, "epoch": 6.813597310422114, "step": 9120}, {"loss": 1.1379, "grad_norm": 1.0332539081573486, "learning_rate": 0.0002, "epoch": 6.8210683601045945, "step": 9130}, {"loss": 1.1652, "grad_norm": 1.2208142280578613, "learning_rate": 0.0002, "epoch": 6.828539409787075, "step": 9140}, {"loss": 1.1463, "grad_norm": 1.3033398389816284, "learning_rate": 0.0002, "epoch": 6.836010459469556, "step": 9150}, {"loss": 1.1834, "grad_norm": 1.2676737308502197, "learning_rate": 0.0002, "epoch": 6.843481509152036, "step": 9160}, {"loss": 1.1786, "grad_norm": 1.1668603420257568, "learning_rate": 0.0002, "epoch": 6.850952558834516, "step": 9170}, {"loss": 1.1801, "grad_norm": 1.1994788646697998, "learning_rate": 0.0002, "epoch": 6.858423608516997, "step": 9180}, {"loss": 1.2131, "grad_norm": 1.231873869895935, "learning_rate": 0.0002, "epoch": 6.865894658199477, "step": 9190}, {"loss": 1.2109, "grad_norm": 0.9981484413146973, "learning_rate": 0.0002, "epoch": 6.873365707881957, "step": 9200}, {"loss": 1.1084, "grad_norm": 1.2799428701400757, "learning_rate": 0.0002, "epoch": 6.880836757564438, "step": 9210}, {"loss": 1.2004, "grad_norm": 1.2042057514190674, "learning_rate": 0.0002, "epoch": 6.888307807246918, "step": 9220}, {"loss": 1.1567, "grad_norm": 1.070420265197754, "learning_rate": 0.0002, "epoch": 6.8957788569293985, "step": 9230}, {"loss": 1.1353, "grad_norm": 1.327160358428955, "learning_rate": 0.0002, "epoch": 6.903249906611879, "step": 9240}, {"loss": 1.1945, "grad_norm": 1.1109007596969604, "learning_rate": 0.0002, "epoch": 6.91072095629436, "step": 9250}, {"loss": 1.1701, "grad_norm": 1.1669930219650269, "learning_rate": 0.0002, "epoch": 6.91819200597684, "step": 9260}, {"loss": 1.1854, "grad_norm": 1.034532904624939, "learning_rate": 0.0002, "epoch": 6.92566305565932, "step": 9270}, {"loss": 1.1712, "grad_norm": 1.1035540103912354, "learning_rate": 0.0002, "epoch": 6.9331341053418, "step": 9280}, {"loss": 1.1767, "grad_norm": 1.366254448890686, "learning_rate": 0.0002, "epoch": 6.940605155024281, "step": 9290}, {"loss": 1.1591, "grad_norm": 1.094214677810669, "learning_rate": 0.0002, "epoch": 6.948076204706761, "step": 9300}, {"loss": 1.18, "grad_norm": 1.131238579750061, "learning_rate": 0.0002, "epoch": 6.955547254389241, "step": 9310}, {"loss": 1.2513, "grad_norm": 1.202369213104248, "learning_rate": 0.0002, "epoch": 6.963018304071722, "step": 9320}, {"loss": 1.1922, "grad_norm": 1.1067225933074951, "learning_rate": 0.0002, "epoch": 6.9704893537542025, "step": 9330}, {"loss": 1.1965, "grad_norm": 1.0258643627166748, "learning_rate": 0.0002, "epoch": 6.977960403436683, "step": 9340}, {"loss": 1.2053, "grad_norm": 1.3311655521392822, "learning_rate": 0.0002, "epoch": 6.985431453119164, "step": 9350}, {"loss": 1.1778, "grad_norm": 1.1245559453964233, "learning_rate": 0.0002, "epoch": 6.992902502801644, "step": 9360}, {"eval_loss": 2.128103017807007, "eval_runtime": 39.1339, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.661, "epoch": 6.999626447515876, "step": 9369}, {"loss": 1.1782, "grad_norm": 1.0868251323699951, "learning_rate": 0.0002, "epoch": 7.000373552484124, "step": 9370}, {"loss": 1.0133, "grad_norm": 1.5252128839492798, "learning_rate": 0.0002, "epoch": 7.007844602166604, "step": 9380}, {"loss": 0.9364, "grad_norm": 1.1230034828186035, "learning_rate": 0.0002, "epoch": 7.015315651849085, "step": 9390}, {"loss": 0.9702, "grad_norm": 1.275871992111206, "learning_rate": 0.0002, "epoch": 7.022786701531565, "step": 9400}, {"loss": 0.9305, "grad_norm": 1.462963342666626, "learning_rate": 0.0002, "epoch": 7.030257751214045, "step": 9410}, {"loss": 0.9329, "grad_norm": 1.0506054162979126, "learning_rate": 0.0002, "epoch": 7.0377288008965255, "step": 9420}, {"loss": 0.9398, "grad_norm": 1.4315128326416016, "learning_rate": 0.0002, "epoch": 7.0451998505790066, "step": 9430}, {"loss": 0.9086, "grad_norm": 1.5143473148345947, "learning_rate": 0.0002, "epoch": 7.052670900261487, "step": 9440}, {"loss": 0.9712, "grad_norm": 1.2537293434143066, "learning_rate": 0.0002, "epoch": 7.060141949943967, "step": 9450}, {"loss": 0.9591, "grad_norm": 1.36807382106781, "learning_rate": 0.0002, "epoch": 7.067612999626448, "step": 9460}, {"loss": 1.0046, "grad_norm": 1.5365028381347656, "learning_rate": 0.0002, "epoch": 7.075084049308928, "step": 9470}, {"loss": 1.0045, "grad_norm": 1.227250576019287, "learning_rate": 0.0002, "epoch": 7.082555098991408, "step": 9480}, {"loss": 0.9745, "grad_norm": 1.6941372156143188, "learning_rate": 0.0002, "epoch": 7.090026148673888, "step": 9490}, {"loss": 0.9203, "grad_norm": 1.587410569190979, "learning_rate": 0.0002, "epoch": 7.097497198356369, "step": 9500}, {"loss": 0.9713, "grad_norm": 1.481272578239441, "learning_rate": 0.0002, "epoch": 7.104968248038849, "step": 9510}, {"loss": 1.0066, "grad_norm": 1.2331953048706055, "learning_rate": 0.0002, "epoch": 7.1124392977213295, "step": 9520}, {"loss": 0.9422, "grad_norm": 1.6446775197982788, "learning_rate": 0.0002, "epoch": 7.119910347403811, "step": 9530}, {"loss": 0.901, "grad_norm": 1.2055929899215698, "learning_rate": 0.0002, "epoch": 7.127381397086291, "step": 9540}, {"loss": 0.8959, "grad_norm": 1.119033932685852, "learning_rate": 0.0002, "epoch": 7.134852446768771, "step": 9550}, {"loss": 0.9586, "grad_norm": 1.712833046913147, "learning_rate": 0.0002, "epoch": 7.142323496451251, "step": 9560}, {"loss": 0.9326, "grad_norm": 1.2007980346679688, "learning_rate": 0.0002, "epoch": 7.149794546133732, "step": 9570}, {"loss": 0.9926, "grad_norm": 1.3251731395721436, "learning_rate": 0.0002, "epoch": 7.157265595816212, "step": 9580}, {"loss": 0.973, "grad_norm": 1.1897934675216675, "learning_rate": 0.0002, "epoch": 7.164736645498692, "step": 9590}, {"loss": 0.9401, "grad_norm": 1.348583698272705, "learning_rate": 0.0002, "epoch": 7.172207695181173, "step": 9600}, {"loss": 0.9931, "grad_norm": 1.1588937044143677, "learning_rate": 0.0002, "epoch": 7.179678744863653, "step": 9610}, {"loss": 0.9913, "grad_norm": 1.3808276653289795, "learning_rate": 0.0002, "epoch": 7.187149794546134, "step": 9620}, {"loss": 0.9955, "grad_norm": 1.552425503730774, "learning_rate": 0.0002, "epoch": 7.194620844228614, "step": 9630}, {"loss": 1.0255, "grad_norm": 1.3649828433990479, "learning_rate": 0.0002, "epoch": 7.202091893911095, "step": 9640}, {"loss": 0.973, "grad_norm": 1.3196533918380737, "learning_rate": 0.0002, "epoch": 7.209562943593575, "step": 9650}, {"loss": 1.0119, "grad_norm": 1.4874017238616943, "learning_rate": 0.0002, "epoch": 7.217033993276055, "step": 9660}, {"loss": 0.9778, "grad_norm": 1.2448325157165527, "learning_rate": 0.0002, "epoch": 7.224505042958536, "step": 9670}, {"loss": 0.9697, "grad_norm": 1.4631818532943726, "learning_rate": 0.0002, "epoch": 7.231976092641016, "step": 9680}, {"loss": 0.9827, "grad_norm": 1.2041361331939697, "learning_rate": 0.0002, "epoch": 7.239447142323496, "step": 9690}, {"loss": 0.9417, "grad_norm": 1.559156060218811, "learning_rate": 0.0002, "epoch": 7.246918192005976, "step": 9700}, {"loss": 1.0232, "grad_norm": 1.3939464092254639, "learning_rate": 0.0002, "epoch": 7.2543892416884574, "step": 9710}, {"loss": 1.005, "grad_norm": 1.347583293914795, "learning_rate": 0.0002, "epoch": 7.261860291370938, "step": 9720}, {"loss": 0.9256, "grad_norm": 1.302850365638733, "learning_rate": 0.0002, "epoch": 7.269331341053418, "step": 9730}, {"loss": 0.956, "grad_norm": 1.1425062417984009, "learning_rate": 0.0002, "epoch": 7.276802390735899, "step": 9740}, {"loss": 0.9978, "grad_norm": 1.2865869998931885, "learning_rate": 0.0002, "epoch": 7.284273440418379, "step": 9750}, {"loss": 0.9841, "grad_norm": 1.3773187398910522, "learning_rate": 0.0002, "epoch": 7.291744490100859, "step": 9760}, {"loss": 1.0063, "grad_norm": 1.2692701816558838, "learning_rate": 0.0002, "epoch": 7.299215539783339, "step": 9770}, {"loss": 1.0347, "grad_norm": 1.38542902469635, "learning_rate": 0.0002, "epoch": 7.30668658946582, "step": 9780}, {"loss": 0.9606, "grad_norm": 1.2204844951629639, "learning_rate": 0.0002, "epoch": 7.3141576391483, "step": 9790}, {"loss": 1.0225, "grad_norm": 1.4863795042037964, "learning_rate": 0.0002, "epoch": 7.32162868883078, "step": 9800}, {"loss": 0.9933, "grad_norm": 1.2458586692810059, "learning_rate": 0.0002, "epoch": 7.3290997385132615, "step": 9810}, {"loss": 1.0336, "grad_norm": 1.3530622720718384, "learning_rate": 0.0002, "epoch": 7.336570788195742, "step": 9820}, {"loss": 0.9319, "grad_norm": 1.2571991682052612, "learning_rate": 0.0002, "epoch": 7.344041837878222, "step": 9830}, {"loss": 1.0042, "grad_norm": 1.3074439764022827, "learning_rate": 0.0002, "epoch": 7.351512887560702, "step": 9840}, {"loss": 1.0433, "grad_norm": 1.2986950874328613, "learning_rate": 0.0002, "epoch": 7.358983937243183, "step": 9850}, {"loss": 1.0078, "grad_norm": 1.4233403205871582, "learning_rate": 0.0002, "epoch": 7.366454986925663, "step": 9860}, {"loss": 0.9359, "grad_norm": 1.468161702156067, "learning_rate": 0.0002, "epoch": 7.373926036608143, "step": 9870}, {"loss": 1.1074, "grad_norm": 1.354690432548523, "learning_rate": 0.0002, "epoch": 7.381397086290624, "step": 9880}, {"loss": 1.0153, "grad_norm": 1.4891324043273926, "learning_rate": 0.0002, "epoch": 7.388868135973104, "step": 9890}, {"loss": 1.0234, "grad_norm": 1.3470090627670288, "learning_rate": 0.0002, "epoch": 7.3963391856555845, "step": 9900}, {"loss": 1.063, "grad_norm": 1.373061180114746, "learning_rate": 0.0002, "epoch": 7.403810235338065, "step": 9910}, {"loss": 1.0109, "grad_norm": 1.4181641340255737, "learning_rate": 0.0002, "epoch": 7.411281285020546, "step": 9920}, {"loss": 0.9801, "grad_norm": 1.3284671306610107, "learning_rate": 0.0002, "epoch": 7.418752334703026, "step": 9930}, {"loss": 1.0859, "grad_norm": 1.333896517753601, "learning_rate": 0.0002, "epoch": 7.426223384385506, "step": 9940}, {"loss": 1.0056, "grad_norm": 1.6348158121109009, "learning_rate": 0.0002, "epoch": 7.433694434067987, "step": 9950}, {"loss": 1.0173, "grad_norm": 1.364643931388855, "learning_rate": 0.0002, "epoch": 7.441165483750467, "step": 9960}, {"loss": 1.0076, "grad_norm": 1.3974874019622803, "learning_rate": 0.0002, "epoch": 7.448636533432947, "step": 9970}, {"loss": 0.9918, "grad_norm": 1.5207233428955078, "learning_rate": 0.0002, "epoch": 7.456107583115427, "step": 9980}, {"loss": 1.019, "grad_norm": 1.541517734527588, "learning_rate": 0.0002, "epoch": 7.463578632797908, "step": 9990}, {"loss": 0.9904, "grad_norm": 1.3563939332962036, "learning_rate": 0.0002, "epoch": 7.4710496824803885, "step": 10000}, {"loss": 1.0285, "grad_norm": 1.3443987369537354, "learning_rate": 0.0002, "epoch": 7.478520732162869, "step": 10010}, {"loss": 1.0028, "grad_norm": 1.2904508113861084, "learning_rate": 0.0002, "epoch": 7.48599178184535, "step": 10020}, {"loss": 0.9949, "grad_norm": 1.434145450592041, "learning_rate": 0.0002, "epoch": 7.49346283152783, "step": 10030}, {"loss": 1.0837, "grad_norm": 1.4659384489059448, "learning_rate": 0.0002, "epoch": 7.50093388121031, "step": 10040}, {"loss": 1.0063, "grad_norm": 1.3430006504058838, "learning_rate": 0.0002, "epoch": 7.508404930892791, "step": 10050}, {"loss": 1.0168, "grad_norm": 1.3595343828201294, "learning_rate": 0.0002, "epoch": 7.515875980575271, "step": 10060}, {"loss": 0.9609, "grad_norm": 1.7456434965133667, "learning_rate": 0.0002, "epoch": 7.523347030257751, "step": 10070}, {"loss": 1.0843, "grad_norm": 1.329853892326355, "learning_rate": 0.0002, "epoch": 7.530818079940231, "step": 10080}, {"loss": 1.0879, "grad_norm": 1.548466682434082, "learning_rate": 0.0002, "epoch": 7.538289129622712, "step": 10090}, {"loss": 1.0221, "grad_norm": 1.2951644659042358, "learning_rate": 0.0002, "epoch": 7.5457601793051925, "step": 10100}, {"loss": 1.007, "grad_norm": 1.3988010883331299, "learning_rate": 0.0002, "epoch": 7.553231228987673, "step": 10110}, {"loss": 1.0353, "grad_norm": 1.211068868637085, "learning_rate": 0.0002, "epoch": 7.560702278670153, "step": 10120}, {"loss": 0.9962, "grad_norm": 1.2159098386764526, "learning_rate": 0.0002, "epoch": 7.568173328352634, "step": 10130}, {"loss": 0.9928, "grad_norm": 1.3533744812011719, "learning_rate": 0.0002, "epoch": 7.575644378035114, "step": 10140}, {"loss": 1.0386, "grad_norm": 1.3153362274169922, "learning_rate": 0.0002, "epoch": 7.583115427717594, "step": 10150}, {"loss": 1.0017, "grad_norm": 1.535762906074524, "learning_rate": 0.0002, "epoch": 7.590586477400075, "step": 10160}, {"loss": 1.0592, "grad_norm": 1.5531504154205322, "learning_rate": 0.0002, "epoch": 7.598057527082555, "step": 10170}, {"loss": 1.0684, "grad_norm": 1.3588606119155884, "learning_rate": 0.0002, "epoch": 7.605528576765035, "step": 10180}, {"loss": 1.0838, "grad_norm": 1.6648331880569458, "learning_rate": 0.0002, "epoch": 7.6129996264475155, "step": 10190}, {"loss": 1.04, "grad_norm": 1.250205159187317, "learning_rate": 0.0002, "epoch": 7.6204706761299965, "step": 10200}, {"loss": 1.0188, "grad_norm": 1.2442443370819092, "learning_rate": 0.0002, "epoch": 7.627941725812477, "step": 10210}, {"loss": 1.0297, "grad_norm": 1.386197805404663, "learning_rate": 0.0002, "epoch": 7.635412775494957, "step": 10220}, {"loss": 1.0233, "grad_norm": 1.3478381633758545, "learning_rate": 0.0002, "epoch": 7.642883825177438, "step": 10230}, {"loss": 1.0313, "grad_norm": 1.2800627946853638, "learning_rate": 0.0002, "epoch": 7.650354874859918, "step": 10240}, {"loss": 1.0252, "grad_norm": 1.4082499742507935, "learning_rate": 0.0002, "epoch": 7.657825924542398, "step": 10250}, {"loss": 1.0462, "grad_norm": 1.321529746055603, "learning_rate": 0.0002, "epoch": 7.665296974224878, "step": 10260}, {"loss": 1.0727, "grad_norm": 1.4213372468948364, "learning_rate": 0.0002, "epoch": 7.672768023907359, "step": 10270}, {"loss": 0.9938, "grad_norm": 1.5585565567016602, "learning_rate": 0.0002, "epoch": 7.680239073589839, "step": 10280}, {"loss": 1.021, "grad_norm": 1.4025108814239502, "learning_rate": 0.0002, "epoch": 7.6877101232723195, "step": 10290}, {"loss": 1.0136, "grad_norm": 1.344456434249878, "learning_rate": 0.0002, "epoch": 7.6951811729548005, "step": 10300}, {"loss": 1.0157, "grad_norm": 1.4962990283966064, "learning_rate": 0.0002, "epoch": 7.702652222637281, "step": 10310}, {"loss": 1.0621, "grad_norm": 1.4523862600326538, "learning_rate": 0.0002, "epoch": 7.710123272319761, "step": 10320}, {"loss": 1.0413, "grad_norm": 1.401842474937439, "learning_rate": 0.0002, "epoch": 7.717594322002241, "step": 10330}, {"loss": 1.0028, "grad_norm": 1.517730474472046, "learning_rate": 0.0002, "epoch": 7.725065371684722, "step": 10340}, {"loss": 1.0061, "grad_norm": 1.3876111507415771, "learning_rate": 0.0002, "epoch": 7.732536421367202, "step": 10350}, {"loss": 1.0071, "grad_norm": 1.5741353034973145, "learning_rate": 0.0002, "epoch": 7.740007471049682, "step": 10360}, {"loss": 1.0472, "grad_norm": 1.3465591669082642, "learning_rate": 0.0002, "epoch": 7.747478520732163, "step": 10370}, {"loss": 0.9961, "grad_norm": 1.3611412048339844, "learning_rate": 0.0002, "epoch": 7.754949570414643, "step": 10380}, {"loss": 1.0118, "grad_norm": 1.693565011024475, "learning_rate": 0.0002, "epoch": 7.7624206200971235, "step": 10390}, {"loss": 1.0981, "grad_norm": 1.4654128551483154, "learning_rate": 0.0002, "epoch": 7.769891669779604, "step": 10400}, {"loss": 1.029, "grad_norm": 1.417768955230713, "learning_rate": 0.0002, "epoch": 7.777362719462085, "step": 10410}, {"loss": 1.0218, "grad_norm": 1.3143322467803955, "learning_rate": 0.0002, "epoch": 7.784833769144565, "step": 10420}, {"loss": 1.0224, "grad_norm": 1.3467497825622559, "learning_rate": 0.0002, "epoch": 7.792304818827045, "step": 10430}, {"loss": 1.0555, "grad_norm": 1.223697543144226, "learning_rate": 0.0002, "epoch": 7.799775868509526, "step": 10440}, {"loss": 1.0198, "grad_norm": 1.3060917854309082, "learning_rate": 0.0002, "epoch": 7.807246918192006, "step": 10450}, {"loss": 1.0896, "grad_norm": 1.5561134815216064, "learning_rate": 0.0002, "epoch": 7.814717967874486, "step": 10460}, {"loss": 1.0981, "grad_norm": 1.2789647579193115, "learning_rate": 0.0002, "epoch": 7.822189017556966, "step": 10470}, {"loss": 1.0549, "grad_norm": 1.2422796487808228, "learning_rate": 0.0002, "epoch": 7.829660067239447, "step": 10480}, {"loss": 1.0255, "grad_norm": 1.377565622329712, "learning_rate": 0.0002, "epoch": 7.8371311169219275, "step": 10490}, {"loss": 1.0864, "grad_norm": 1.2221037149429321, "learning_rate": 0.0002, "epoch": 7.844602166604408, "step": 10500}, {"loss": 1.0944, "grad_norm": 1.3779186010360718, "learning_rate": 0.0002, "epoch": 7.852073216286888, "step": 10510}, {"loss": 1.0694, "grad_norm": 1.3062539100646973, "learning_rate": 0.0002, "epoch": 7.859544265969369, "step": 10520}, {"loss": 1.048, "grad_norm": 1.4066052436828613, "learning_rate": 0.0002, "epoch": 7.867015315651849, "step": 10530}, {"loss": 1.0674, "grad_norm": 1.6326613426208496, "learning_rate": 0.0002, "epoch": 7.874486365334329, "step": 10540}, {"loss": 1.1237, "grad_norm": 1.1732137203216553, "learning_rate": 0.0002, "epoch": 7.88195741501681, "step": 10550}, {"loss": 1.0682, "grad_norm": 1.303125023841858, "learning_rate": 0.0002, "epoch": 7.88942846469929, "step": 10560}, {"loss": 1.0524, "grad_norm": 1.294990062713623, "learning_rate": 0.0002, "epoch": 7.89689951438177, "step": 10570}, {"loss": 1.0577, "grad_norm": 1.4719983339309692, "learning_rate": 0.0002, "epoch": 7.904370564064251, "step": 10580}, {"loss": 1.0397, "grad_norm": 1.4117742776870728, "learning_rate": 0.0002, "epoch": 7.911841613746732, "step": 10590}, {"loss": 1.1129, "grad_norm": 1.384812355041504, "learning_rate": 0.0002, "epoch": 7.919312663429212, "step": 10600}, {"loss": 0.9994, "grad_norm": 1.5743740797042847, "learning_rate": 0.0002, "epoch": 7.926783713111692, "step": 10610}, {"loss": 1.0168, "grad_norm": 1.2799863815307617, "learning_rate": 0.0002, "epoch": 7.934254762794173, "step": 10620}, {"loss": 1.0893, "grad_norm": 1.4822591543197632, "learning_rate": 0.0002, "epoch": 7.941725812476653, "step": 10630}, {"loss": 1.0362, "grad_norm": 1.4634777307510376, "learning_rate": 0.0002, "epoch": 7.949196862159133, "step": 10640}, {"loss": 1.0116, "grad_norm": 1.5230964422225952, "learning_rate": 0.0002, "epoch": 7.956667911841613, "step": 10650}, {"loss": 1.0582, "grad_norm": 1.3622701168060303, "learning_rate": 0.0002, "epoch": 7.964138961524094, "step": 10660}, {"loss": 1.1127, "grad_norm": 1.2133928537368774, "learning_rate": 0.0002, "epoch": 7.971610011206574, "step": 10670}, {"loss": 1.096, "grad_norm": 1.2852206230163574, "learning_rate": 0.0002, "epoch": 7.9790810608890546, "step": 10680}, {"loss": 1.0286, "grad_norm": 1.243310570716858, "learning_rate": 0.0002, "epoch": 7.986552110571536, "step": 10690}, {"loss": 1.053, "grad_norm": 1.459757924079895, "learning_rate": 0.0002, "epoch": 7.994023160254016, "step": 10700}]}