diff --git a/.gitattributes b/.gitattributes index ccf71c32616e95b5c9e3d106ced333ea4ed14b7f..1ff99891d8950285127945c6b0eda2ba05383baa 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3116,3 +3116,12 @@ Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-1/checkpoint-49504/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-1/checkpoint-6188/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2769e8e1cf584df4dfc7285f967e94a6da2513fa --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e1eaac364e8b495805cd2edf3b38ef0ed7541e15acdd0aa2e53fcb8290fc3f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2e75dcda8a0f1289bb1d94f520d3eb1b1422d3c4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbaaa848c3d0b7be374600ba8128496196592987f07a06a105d7fa706aee37a1 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2586964afa908a82cc4e9bbe43524c32c5e0724 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc79fca637d958631906b62c6f9962effb1009ba43d04f7f9fe53a40e289358 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..552080984b33faab29dfd89386abad344a59b88f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4993c0e1eb2a081b16ab01707c54b9da508ca86e58657d145a099684101f12fa +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..179a43ce4ee9163d24663c18e43135da3b0c5fee --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11e982f5c73eb22906d063a0026aaebaf3eafe7c0db23164f0a172a0dfcb7bd +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f49316e3237df30f83ffcba62903880776dca43 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/trainer_state.json @@ -0,0 +1,825 @@ +{ + "best_metric": 1.8095673322677612, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129", + "epoch": 0.9995573262505534, + "eval_steps": 10, + "global_step": 1129, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.22707309470679e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2769e8e1cf584df4dfc7285f967e94a6da2513fa --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e1eaac364e8b495805cd2edf3b38ef0ed7541e15acdd0aa2e53fcb8290fc3f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e138e27ca0080a8dc28dd3fd7b2f06bdb6dfee28 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aff39d300e4978dce38a3fa60770956be5cf7c1c4fb95a0087df1a918214883b +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec8f05554ae859b230ba5af4865fbd8cf1cd3e62 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b53078a101cd714a49b9b41583b05a498a6d75cbcc28f067522b09c5298582 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a0f63f8c75afaeebc1221ff0117c96333346bcb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dfc8fa3369f4b363e0ad9fb335948fd525eb75928c1e942404239cf491db2a8 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..42350d2c98e30c96ca006cc38c119893931f8d98 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/trainer_state.json @@ -0,0 +1,1624 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2259, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.045414618941358e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a6131fd7132c9ba3a02eee78517bc740df3c15a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e443fd1f1a7d77b019a6f79777397a200b1971631c7f14d74c3e3ff06e7ea63 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..64da1f24361ed625104db93f0ac7efd452dbf9d0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac9ed38f3fd7e686075060d1014bdd697714ee7867160ec907d75c8e68acac13 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dde75feb316d10e40be13707a2828f021151f11a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f960f983d469af723deb8a592cf9b2b80ba7d6f321a608a928ea313d9634e8fa +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f877789474998bb43ad849c64d8f212425bc59d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ac7788b0e96e0d795d63fadf4172b868908fdb70fe22555f021204512e387a +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9fc534d9a5a22ccd80e83393500dce13d4460228 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/trainer_state.json @@ -0,0 +1,2423 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 2.9995573262505535, + "eval_steps": 10, + "global_step": 3388, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + }, + { + "epoch": 2.0008853474988935, + "grad_norm": 0.34340235590934753, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 2.0097388224878263, + "grad_norm": 0.3710762858390808, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2270 + }, + { + "epoch": 2.0185922974767596, + "grad_norm": 0.35640114545822144, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2280 + }, + { + "epoch": 2.027445772465693, + "grad_norm": 0.45970189571380615, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 2290 + }, + { + "epoch": 2.0362992474546258, + "grad_norm": 0.4256797134876251, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2300 + }, + { + "epoch": 2.045152722443559, + "grad_norm": 0.42421531677246094, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2310 + }, + { + "epoch": 2.0540061974324924, + "grad_norm": 0.4032478928565979, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2320 + }, + { + "epoch": 2.062859672421425, + "grad_norm": 0.4073623716831207, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2330 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.4845200777053833, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2340 + }, + { + "epoch": 2.080566622399292, + "grad_norm": 0.40578293800354004, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2350 + }, + { + "epoch": 2.089420097388225, + "grad_norm": 0.4037284255027771, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2360 + }, + { + "epoch": 2.098273572377158, + "grad_norm": 0.4717613160610199, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2370 + }, + { + "epoch": 2.1071270473660912, + "grad_norm": 0.42076411843299866, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2380 + }, + { + "epoch": 2.1159805223550245, + "grad_norm": 0.47799113392829895, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 2390 + }, + { + "epoch": 2.1248339973439574, + "grad_norm": 0.4253084063529968, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2400 + }, + { + "epoch": 2.1336874723328907, + "grad_norm": 0.5023085474967957, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 2410 + }, + { + "epoch": 2.142540947321824, + "grad_norm": 0.49162712693214417, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2420 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.39035019278526306, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2430 + }, + { + "epoch": 2.16024789729969, + "grad_norm": 0.43223854899406433, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 2440 + }, + { + "epoch": 2.1691013722886234, + "grad_norm": 0.4596616327762604, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 2.1779548472775563, + "grad_norm": 0.4469447731971741, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2460 + }, + { + "epoch": 2.1868083222664896, + "grad_norm": 0.5100595355033875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2470 + }, + { + "epoch": 2.195661797255423, + "grad_norm": 0.4169430732727051, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2480 + }, + { + "epoch": 2.2045152722443557, + "grad_norm": 0.4699254035949707, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 2490 + }, + { + "epoch": 2.213368747233289, + "grad_norm": 0.43524250388145447, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4496648907661438, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2510 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.43408212065696716, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2520 + }, + { + "epoch": 2.2399291722000885, + "grad_norm": 0.4596034288406372, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2530 + }, + { + "epoch": 2.2487826471890218, + "grad_norm": 0.5217021107673645, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2540 + }, + { + "epoch": 2.2576361221779546, + "grad_norm": 0.44745638966560364, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2550 + }, + { + "epoch": 2.266489597166888, + "grad_norm": 0.4484798014163971, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 2560 + }, + { + "epoch": 2.275343072155821, + "grad_norm": 0.4428067207336426, + "learning_rate": 0.0002, + "loss": 1.5321, + "step": 2570 + }, + { + "epoch": 2.2841965471447545, + "grad_norm": 0.5095171332359314, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2580 + }, + { + "epoch": 2.2930500221336874, + "grad_norm": 0.44833096861839294, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2590 + }, + { + "epoch": 2.3019034971226207, + "grad_norm": 0.507905900478363, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 2600 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.40808171033859253, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2610 + }, + { + "epoch": 2.319610447100487, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2620 + }, + { + "epoch": 2.32846392208942, + "grad_norm": 0.44864922761917114, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2630 + }, + { + "epoch": 2.3373173970783534, + "grad_norm": 0.4174162745475769, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2640 + }, + { + "epoch": 2.3461708720672863, + "grad_norm": 0.42314743995666504, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 2650 + }, + { + "epoch": 2.3550243470562195, + "grad_norm": 0.49224185943603516, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2660 + }, + { + "epoch": 2.363877822045153, + "grad_norm": 0.45190292596817017, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2670 + }, + { + "epoch": 2.3727312970340857, + "grad_norm": 0.41817107796669006, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 2680 + }, + { + "epoch": 2.381584772023019, + "grad_norm": 0.6436763405799866, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2690 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.47175949811935425, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2700 + }, + { + "epoch": 2.3992917220008856, + "grad_norm": 0.480339378118515, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 2710 + }, + { + "epoch": 2.4081451969898184, + "grad_norm": 0.4723486006259918, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2720 + }, + { + "epoch": 2.4169986719787517, + "grad_norm": 0.4305492043495178, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 2730 + }, + { + "epoch": 2.425852146967685, + "grad_norm": 0.5007492303848267, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2740 + }, + { + "epoch": 2.434705621956618, + "grad_norm": 0.5374062061309814, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 2750 + }, + { + "epoch": 2.443559096945551, + "grad_norm": 0.45866212248802185, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2760 + }, + { + "epoch": 2.4524125719344845, + "grad_norm": 0.47914502024650574, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 2770 + }, + { + "epoch": 2.4612660469234173, + "grad_norm": 0.43804746866226196, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 2780 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.43656906485557556, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2790 + }, + { + "epoch": 2.478972996901284, + "grad_norm": 0.4820363521575928, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 2800 + }, + { + "epoch": 2.4878264718902168, + "grad_norm": 0.4916800558567047, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2810 + }, + { + "epoch": 2.49667994687915, + "grad_norm": 0.4521256983280182, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2820 + }, + { + "epoch": 2.5055334218680834, + "grad_norm": 0.5066806674003601, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2830 + }, + { + "epoch": 2.514386896857016, + "grad_norm": 0.4768151640892029, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2840 + }, + { + "epoch": 2.5232403718459495, + "grad_norm": 0.5144683718681335, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 2850 + }, + { + "epoch": 2.532093846834883, + "grad_norm": 0.4718942940235138, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2860 + }, + { + "epoch": 2.5409473218238157, + "grad_norm": 0.4924587309360504, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2870 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.4649953842163086, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2880 + }, + { + "epoch": 2.5586542718016823, + "grad_norm": 0.4836665987968445, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2890 + }, + { + "epoch": 2.567507746790615, + "grad_norm": 0.4162124991416931, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 2900 + }, + { + "epoch": 2.5763612217795484, + "grad_norm": 0.4894537925720215, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2910 + }, + { + "epoch": 2.5852146967684817, + "grad_norm": 0.4539397358894348, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 2.5940681717574146, + "grad_norm": 0.4718773066997528, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 2930 + }, + { + "epoch": 2.602921646746348, + "grad_norm": 0.49989837408065796, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2940 + }, + { + "epoch": 2.611775121735281, + "grad_norm": 0.4862406849861145, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2950 + }, + { + "epoch": 2.620628596724214, + "grad_norm": 0.4244804382324219, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2960 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.49304354190826416, + "learning_rate": 0.0002, + "loss": 1.7795, + "step": 2970 + }, + { + "epoch": 2.6383355467020806, + "grad_norm": 0.4818236529827118, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 2980 + }, + { + "epoch": 2.647189021691014, + "grad_norm": 0.5077425837516785, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 2990 + }, + { + "epoch": 2.6560424966799467, + "grad_norm": 0.4494157135486603, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 3000 + }, + { + "epoch": 2.66489597166888, + "grad_norm": 0.4790278971195221, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 3010 + }, + { + "epoch": 2.6737494466578133, + "grad_norm": 0.4702624976634979, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3020 + }, + { + "epoch": 2.682602921646746, + "grad_norm": 0.5082133412361145, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3030 + }, + { + "epoch": 2.6914563966356795, + "grad_norm": 0.4553256630897522, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 3040 + }, + { + "epoch": 2.700309871624613, + "grad_norm": 0.4492715001106262, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3050 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.4555944502353668, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 3060 + }, + { + "epoch": 2.718016821602479, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3070 + }, + { + "epoch": 2.7268702965914122, + "grad_norm": 0.4628562927246094, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3080 + }, + { + "epoch": 2.7357237715803455, + "grad_norm": 0.5169575810432434, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 3090 + }, + { + "epoch": 2.7445772465692784, + "grad_norm": 0.4630090892314911, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 3100 + }, + { + "epoch": 2.7534307215582117, + "grad_norm": 0.5437219738960266, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3110 + }, + { + "epoch": 2.762284196547145, + "grad_norm": 0.5102152228355408, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 3120 + }, + { + "epoch": 2.771137671536078, + "grad_norm": 0.48287826776504517, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3130 + }, + { + "epoch": 2.779991146525011, + "grad_norm": 0.4671737253665924, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 3140 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 0.5177035331726074, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3150 + }, + { + "epoch": 2.7976980965028773, + "grad_norm": 0.450989305973053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3160 + }, + { + "epoch": 2.8065515714918106, + "grad_norm": 0.45007848739624023, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3170 + }, + { + "epoch": 2.815405046480744, + "grad_norm": 0.4600294530391693, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3180 + }, + { + "epoch": 2.8242585214696767, + "grad_norm": 0.485628604888916, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 3190 + }, + { + "epoch": 2.83311199645861, + "grad_norm": 0.49811574816703796, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 3200 + }, + { + "epoch": 2.8419654714475433, + "grad_norm": 0.5012516975402832, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 3210 + }, + { + "epoch": 2.850818946436476, + "grad_norm": 0.4552757740020752, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3220 + }, + { + "epoch": 2.8596724214254094, + "grad_norm": 0.4539635479450226, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3230 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3240 + }, + { + "epoch": 2.8773793714032756, + "grad_norm": 0.4570811688899994, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3250 + }, + { + "epoch": 2.886232846392209, + "grad_norm": 0.48181653022766113, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3260 + }, + { + "epoch": 2.895086321381142, + "grad_norm": 0.4871032238006592, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 3270 + }, + { + "epoch": 2.903939796370075, + "grad_norm": 0.4643239676952362, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3280 + }, + { + "epoch": 2.9127932713590083, + "grad_norm": 0.5024484395980835, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3290 + }, + { + "epoch": 2.9216467463479416, + "grad_norm": 0.4425384998321533, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3300 + }, + { + "epoch": 2.9305002213368745, + "grad_norm": 0.459168016910553, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 3310 + }, + { + "epoch": 2.939353696325808, + "grad_norm": 0.4950717091560364, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 3320 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.4516230523586273, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3330 + }, + { + "epoch": 2.957060646303674, + "grad_norm": 0.49523285031318665, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3340 + }, + { + "epoch": 2.9659141212926072, + "grad_norm": 0.49282631278038025, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 3350 + }, + { + "epoch": 2.9747675962815405, + "grad_norm": 0.45825016498565674, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 3360 + }, + { + "epoch": 2.983621071270474, + "grad_norm": 0.4952891170978546, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 3370 + }, + { + "epoch": 2.9924745462594067, + "grad_norm": 0.42182639241218567, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3380 + }, + { + "epoch": 2.9995573262505535, + "eval_loss": 1.8308420181274414, + "eval_runtime": 82.786, + "eval_samples_per_second": 6.221, + "eval_steps_per_second": 0.785, + "step": 3388 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.568121928412037e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-3388/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0034d500c0a7552c64d5fbd1311197cfe2df0b1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3afd2eaaf8bacbb4903b9b67cd4f42cf26e77e095d7dd43f2e437450371cf41d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4c8874533130e56aa1f976ae77c034233b57068 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6208f86e33e21c746139ddda18f67208717e0cb813bef03e8153754697036c95 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f5c109a9e74141dfda86b8b1db50d7aa83dc7d6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732211ce39743764df47957bfb7dfe060ca8265c3e5ad61941aa086eaf495421 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f643a586aff090099d26b844dd7d9aa7f3a7ce74 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee35733c3f8c3816d378885349e1298d1bc9242d983080dfa72fe5bd0840be76 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..53977a4de46f0b1484fdd99bd9d68101603fc475 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/trainer_state.json @@ -0,0 +1,3222 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 4518, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + }, + { + "epoch": 2.0008853474988935, + "grad_norm": 0.34340235590934753, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 2.0097388224878263, + "grad_norm": 0.3710762858390808, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2270 + }, + { + "epoch": 2.0185922974767596, + "grad_norm": 0.35640114545822144, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2280 + }, + { + "epoch": 2.027445772465693, + "grad_norm": 0.45970189571380615, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 2290 + }, + { + "epoch": 2.0362992474546258, + "grad_norm": 0.4256797134876251, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2300 + }, + { + "epoch": 2.045152722443559, + "grad_norm": 0.42421531677246094, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2310 + }, + { + "epoch": 2.0540061974324924, + "grad_norm": 0.4032478928565979, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2320 + }, + { + "epoch": 2.062859672421425, + "grad_norm": 0.4073623716831207, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2330 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.4845200777053833, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2340 + }, + { + "epoch": 2.080566622399292, + "grad_norm": 0.40578293800354004, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2350 + }, + { + "epoch": 2.089420097388225, + "grad_norm": 0.4037284255027771, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2360 + }, + { + "epoch": 2.098273572377158, + "grad_norm": 0.4717613160610199, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2370 + }, + { + "epoch": 2.1071270473660912, + "grad_norm": 0.42076411843299866, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2380 + }, + { + "epoch": 2.1159805223550245, + "grad_norm": 0.47799113392829895, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 2390 + }, + { + "epoch": 2.1248339973439574, + "grad_norm": 0.4253084063529968, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2400 + }, + { + "epoch": 2.1336874723328907, + "grad_norm": 0.5023085474967957, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 2410 + }, + { + "epoch": 2.142540947321824, + "grad_norm": 0.49162712693214417, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2420 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.39035019278526306, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2430 + }, + { + "epoch": 2.16024789729969, + "grad_norm": 0.43223854899406433, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 2440 + }, + { + "epoch": 2.1691013722886234, + "grad_norm": 0.4596616327762604, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 2.1779548472775563, + "grad_norm": 0.4469447731971741, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2460 + }, + { + "epoch": 2.1868083222664896, + "grad_norm": 0.5100595355033875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2470 + }, + { + "epoch": 2.195661797255423, + "grad_norm": 0.4169430732727051, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2480 + }, + { + "epoch": 2.2045152722443557, + "grad_norm": 0.4699254035949707, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 2490 + }, + { + "epoch": 2.213368747233289, + "grad_norm": 0.43524250388145447, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4496648907661438, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2510 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.43408212065696716, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2520 + }, + { + "epoch": 2.2399291722000885, + "grad_norm": 0.4596034288406372, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2530 + }, + { + "epoch": 2.2487826471890218, + "grad_norm": 0.5217021107673645, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2540 + }, + { + "epoch": 2.2576361221779546, + "grad_norm": 0.44745638966560364, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2550 + }, + { + "epoch": 2.266489597166888, + "grad_norm": 0.4484798014163971, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 2560 + }, + { + "epoch": 2.275343072155821, + "grad_norm": 0.4428067207336426, + "learning_rate": 0.0002, + "loss": 1.5321, + "step": 2570 + }, + { + "epoch": 2.2841965471447545, + "grad_norm": 0.5095171332359314, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2580 + }, + { + "epoch": 2.2930500221336874, + "grad_norm": 0.44833096861839294, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2590 + }, + { + "epoch": 2.3019034971226207, + "grad_norm": 0.507905900478363, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 2600 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.40808171033859253, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2610 + }, + { + "epoch": 2.319610447100487, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2620 + }, + { + "epoch": 2.32846392208942, + "grad_norm": 0.44864922761917114, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2630 + }, + { + "epoch": 2.3373173970783534, + "grad_norm": 0.4174162745475769, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2640 + }, + { + "epoch": 2.3461708720672863, + "grad_norm": 0.42314743995666504, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 2650 + }, + { + "epoch": 2.3550243470562195, + "grad_norm": 0.49224185943603516, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2660 + }, + { + "epoch": 2.363877822045153, + "grad_norm": 0.45190292596817017, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2670 + }, + { + "epoch": 2.3727312970340857, + "grad_norm": 0.41817107796669006, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 2680 + }, + { + "epoch": 2.381584772023019, + "grad_norm": 0.6436763405799866, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2690 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.47175949811935425, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2700 + }, + { + "epoch": 2.3992917220008856, + "grad_norm": 0.480339378118515, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 2710 + }, + { + "epoch": 2.4081451969898184, + "grad_norm": 0.4723486006259918, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2720 + }, + { + "epoch": 2.4169986719787517, + "grad_norm": 0.4305492043495178, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 2730 + }, + { + "epoch": 2.425852146967685, + "grad_norm": 0.5007492303848267, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2740 + }, + { + "epoch": 2.434705621956618, + "grad_norm": 0.5374062061309814, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 2750 + }, + { + "epoch": 2.443559096945551, + "grad_norm": 0.45866212248802185, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2760 + }, + { + "epoch": 2.4524125719344845, + "grad_norm": 0.47914502024650574, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 2770 + }, + { + "epoch": 2.4612660469234173, + "grad_norm": 0.43804746866226196, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 2780 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.43656906485557556, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2790 + }, + { + "epoch": 2.478972996901284, + "grad_norm": 0.4820363521575928, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 2800 + }, + { + "epoch": 2.4878264718902168, + "grad_norm": 0.4916800558567047, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2810 + }, + { + "epoch": 2.49667994687915, + "grad_norm": 0.4521256983280182, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2820 + }, + { + "epoch": 2.5055334218680834, + "grad_norm": 0.5066806674003601, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2830 + }, + { + "epoch": 2.514386896857016, + "grad_norm": 0.4768151640892029, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2840 + }, + { + "epoch": 2.5232403718459495, + "grad_norm": 0.5144683718681335, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 2850 + }, + { + "epoch": 2.532093846834883, + "grad_norm": 0.4718942940235138, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2860 + }, + { + "epoch": 2.5409473218238157, + "grad_norm": 0.4924587309360504, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2870 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.4649953842163086, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2880 + }, + { + "epoch": 2.5586542718016823, + "grad_norm": 0.4836665987968445, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2890 + }, + { + "epoch": 2.567507746790615, + "grad_norm": 0.4162124991416931, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 2900 + }, + { + "epoch": 2.5763612217795484, + "grad_norm": 0.4894537925720215, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2910 + }, + { + "epoch": 2.5852146967684817, + "grad_norm": 0.4539397358894348, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 2.5940681717574146, + "grad_norm": 0.4718773066997528, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 2930 + }, + { + "epoch": 2.602921646746348, + "grad_norm": 0.49989837408065796, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2940 + }, + { + "epoch": 2.611775121735281, + "grad_norm": 0.4862406849861145, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2950 + }, + { + "epoch": 2.620628596724214, + "grad_norm": 0.4244804382324219, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2960 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.49304354190826416, + "learning_rate": 0.0002, + "loss": 1.7795, + "step": 2970 + }, + { + "epoch": 2.6383355467020806, + "grad_norm": 0.4818236529827118, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 2980 + }, + { + "epoch": 2.647189021691014, + "grad_norm": 0.5077425837516785, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 2990 + }, + { + "epoch": 2.6560424966799467, + "grad_norm": 0.4494157135486603, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 3000 + }, + { + "epoch": 2.66489597166888, + "grad_norm": 0.4790278971195221, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 3010 + }, + { + "epoch": 2.6737494466578133, + "grad_norm": 0.4702624976634979, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3020 + }, + { + "epoch": 2.682602921646746, + "grad_norm": 0.5082133412361145, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3030 + }, + { + "epoch": 2.6914563966356795, + "grad_norm": 0.4553256630897522, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 3040 + }, + { + "epoch": 2.700309871624613, + "grad_norm": 0.4492715001106262, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3050 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.4555944502353668, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 3060 + }, + { + "epoch": 2.718016821602479, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3070 + }, + { + "epoch": 2.7268702965914122, + "grad_norm": 0.4628562927246094, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3080 + }, + { + "epoch": 2.7357237715803455, + "grad_norm": 0.5169575810432434, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 3090 + }, + { + "epoch": 2.7445772465692784, + "grad_norm": 0.4630090892314911, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 3100 + }, + { + "epoch": 2.7534307215582117, + "grad_norm": 0.5437219738960266, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3110 + }, + { + "epoch": 2.762284196547145, + "grad_norm": 0.5102152228355408, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 3120 + }, + { + "epoch": 2.771137671536078, + "grad_norm": 0.48287826776504517, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3130 + }, + { + "epoch": 2.779991146525011, + "grad_norm": 0.4671737253665924, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 3140 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 0.5177035331726074, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3150 + }, + { + "epoch": 2.7976980965028773, + "grad_norm": 0.450989305973053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3160 + }, + { + "epoch": 2.8065515714918106, + "grad_norm": 0.45007848739624023, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3170 + }, + { + "epoch": 2.815405046480744, + "grad_norm": 0.4600294530391693, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3180 + }, + { + "epoch": 2.8242585214696767, + "grad_norm": 0.485628604888916, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 3190 + }, + { + "epoch": 2.83311199645861, + "grad_norm": 0.49811574816703796, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 3200 + }, + { + "epoch": 2.8419654714475433, + "grad_norm": 0.5012516975402832, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 3210 + }, + { + "epoch": 2.850818946436476, + "grad_norm": 0.4552757740020752, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3220 + }, + { + "epoch": 2.8596724214254094, + "grad_norm": 0.4539635479450226, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3230 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3240 + }, + { + "epoch": 2.8773793714032756, + "grad_norm": 0.4570811688899994, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3250 + }, + { + "epoch": 2.886232846392209, + "grad_norm": 0.48181653022766113, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3260 + }, + { + "epoch": 2.895086321381142, + "grad_norm": 0.4871032238006592, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 3270 + }, + { + "epoch": 2.903939796370075, + "grad_norm": 0.4643239676952362, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3280 + }, + { + "epoch": 2.9127932713590083, + "grad_norm": 0.5024484395980835, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3290 + }, + { + "epoch": 2.9216467463479416, + "grad_norm": 0.4425384998321533, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3300 + }, + { + "epoch": 2.9305002213368745, + "grad_norm": 0.459168016910553, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 3310 + }, + { + "epoch": 2.939353696325808, + "grad_norm": 0.4950717091560364, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 3320 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.4516230523586273, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3330 + }, + { + "epoch": 2.957060646303674, + "grad_norm": 0.49523285031318665, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3340 + }, + { + "epoch": 2.9659141212926072, + "grad_norm": 0.49282631278038025, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 3350 + }, + { + "epoch": 2.9747675962815405, + "grad_norm": 0.45825016498565674, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 3360 + }, + { + "epoch": 2.983621071270474, + "grad_norm": 0.4952891170978546, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 3370 + }, + { + "epoch": 2.9924745462594067, + "grad_norm": 0.42182639241218567, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3380 + }, + { + "epoch": 2.9995573262505535, + "eval_loss": 1.8308420181274414, + "eval_runtime": 82.786, + "eval_samples_per_second": 6.221, + "eval_steps_per_second": 0.785, + "step": 3388 + }, + { + "epoch": 3.00132802124834, + "grad_norm": 0.47721418738365173, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 3390 + }, + { + "epoch": 3.0101814962372733, + "grad_norm": 0.5284923911094666, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 3400 + }, + { + "epoch": 3.019034971226206, + "grad_norm": 0.5607061982154846, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 3410 + }, + { + "epoch": 3.0278884462151394, + "grad_norm": 0.5271363258361816, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 3420 + }, + { + "epoch": 3.0367419212040727, + "grad_norm": 0.48660898208618164, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 3430 + }, + { + "epoch": 3.0455953961930056, + "grad_norm": 0.5767933130264282, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 3440 + }, + { + "epoch": 3.054448871181939, + "grad_norm": 0.5591282248497009, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3450 + }, + { + "epoch": 3.063302346170872, + "grad_norm": 0.5870814323425293, + "learning_rate": 0.0002, + "loss": 1.5112, + "step": 3460 + }, + { + "epoch": 3.072155821159805, + "grad_norm": 0.4861546456813812, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 3470 + }, + { + "epoch": 3.0810092961487383, + "grad_norm": 0.5238925814628601, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 3480 + }, + { + "epoch": 3.0898627711376716, + "grad_norm": 0.5521751046180725, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 3490 + }, + { + "epoch": 3.098716246126605, + "grad_norm": 0.5816575884819031, + "learning_rate": 0.0002, + "loss": 1.4454, + "step": 3500 + }, + { + "epoch": 3.1075697211155378, + "grad_norm": 0.5281513333320618, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 3510 + }, + { + "epoch": 3.116423196104471, + "grad_norm": 0.5847303867340088, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 3520 + }, + { + "epoch": 3.1252766710934043, + "grad_norm": 0.5683517456054688, + "learning_rate": 0.0002, + "loss": 1.5513, + "step": 3530 + }, + { + "epoch": 3.134130146082337, + "grad_norm": 0.5177015662193298, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 3540 + }, + { + "epoch": 3.1429836210712705, + "grad_norm": 0.5922423601150513, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 3550 + }, + { + "epoch": 3.151837096060204, + "grad_norm": 0.7018587589263916, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 3560 + }, + { + "epoch": 3.1606905710491366, + "grad_norm": 0.6152004599571228, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 3570 + }, + { + "epoch": 3.16954404603807, + "grad_norm": 0.5350717902183533, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 3580 + }, + { + "epoch": 3.1783975210270032, + "grad_norm": 0.5971009731292725, + "learning_rate": 0.0002, + "loss": 1.4739, + "step": 3590 + }, + { + "epoch": 3.187250996015936, + "grad_norm": 0.7312001585960388, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3600 + }, + { + "epoch": 3.1961044710048694, + "grad_norm": 0.6372535228729248, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3610 + }, + { + "epoch": 3.2049579459938027, + "grad_norm": 0.6098020672798157, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 3620 + }, + { + "epoch": 3.2138114209827355, + "grad_norm": 0.5506435632705688, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3630 + }, + { + "epoch": 3.222664895971669, + "grad_norm": 0.6043022274971008, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3640 + }, + { + "epoch": 3.231518370960602, + "grad_norm": 0.5495519042015076, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3650 + }, + { + "epoch": 3.240371845949535, + "grad_norm": 0.5769572257995605, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3660 + }, + { + "epoch": 3.2492253209384683, + "grad_norm": 0.6833786964416504, + "learning_rate": 0.0002, + "loss": 1.4604, + "step": 3670 + }, + { + "epoch": 3.2580787959274016, + "grad_norm": 0.6962856650352478, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3680 + }, + { + "epoch": 3.2669322709163344, + "grad_norm": 0.6553098559379578, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 3690 + }, + { + "epoch": 3.2757857459052677, + "grad_norm": 0.5907557010650635, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 3700 + }, + { + "epoch": 3.284639220894201, + "grad_norm": 0.5712862014770508, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 3710 + }, + { + "epoch": 3.2934926958831343, + "grad_norm": 0.573820948600769, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 3720 + }, + { + "epoch": 3.302346170872067, + "grad_norm": 0.6650304198265076, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 3730 + }, + { + "epoch": 3.3111996458610005, + "grad_norm": 0.5182583928108215, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 3740 + }, + { + "epoch": 3.3200531208499338, + "grad_norm": 0.5078902840614319, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 3750 + }, + { + "epoch": 3.3289065958388666, + "grad_norm": 0.7062374353408813, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 3760 + }, + { + "epoch": 3.3377600708278, + "grad_norm": 0.5711262822151184, + "learning_rate": 0.0002, + "loss": 1.5017, + "step": 3770 + }, + { + "epoch": 3.346613545816733, + "grad_norm": 0.5624606013298035, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 3780 + }, + { + "epoch": 3.355467020805666, + "grad_norm": 0.6008231043815613, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 3790 + }, + { + "epoch": 3.3643204957945994, + "grad_norm": 0.6120018362998962, + "learning_rate": 0.0002, + "loss": 1.5038, + "step": 3800 + }, + { + "epoch": 3.3731739707835326, + "grad_norm": 0.5679979920387268, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3810 + }, + { + "epoch": 3.3820274457724655, + "grad_norm": 0.5613794922828674, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3820 + }, + { + "epoch": 3.390880920761399, + "grad_norm": 0.5328839421272278, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 3830 + }, + { + "epoch": 3.399734395750332, + "grad_norm": 0.5960017442703247, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 3840 + }, + { + "epoch": 3.4085878707392654, + "grad_norm": 0.5264106392860413, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 3850 + }, + { + "epoch": 3.4174413457281982, + "grad_norm": 0.6378359198570251, + "learning_rate": 0.0002, + "loss": 1.4766, + "step": 3860 + }, + { + "epoch": 3.4262948207171315, + "grad_norm": 0.5792967677116394, + "learning_rate": 0.0002, + "loss": 1.4898, + "step": 3870 + }, + { + "epoch": 3.435148295706065, + "grad_norm": 0.6836280822753906, + "learning_rate": 0.0002, + "loss": 1.4914, + "step": 3880 + }, + { + "epoch": 3.4440017706949977, + "grad_norm": 0.6073971390724182, + "learning_rate": 0.0002, + "loss": 1.5002, + "step": 3890 + }, + { + "epoch": 3.452855245683931, + "grad_norm": 0.5753195881843567, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 3900 + }, + { + "epoch": 3.4617087206728643, + "grad_norm": 0.6007646918296814, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3910 + }, + { + "epoch": 3.470562195661797, + "grad_norm": 0.6025636196136475, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3920 + }, + { + "epoch": 3.4794156706507304, + "grad_norm": 0.6819562315940857, + "learning_rate": 0.0002, + "loss": 1.4612, + "step": 3930 + }, + { + "epoch": 3.4882691456396637, + "grad_norm": 0.6448395848274231, + "learning_rate": 0.0002, + "loss": 1.518, + "step": 3940 + }, + { + "epoch": 3.4971226206285966, + "grad_norm": 0.5712178945541382, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 3950 + }, + { + "epoch": 3.50597609561753, + "grad_norm": 0.6300532817840576, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 3960 + }, + { + "epoch": 3.514829570606463, + "grad_norm": 0.6120840907096863, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 3970 + }, + { + "epoch": 3.523683045595396, + "grad_norm": 0.6887575387954712, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 3980 + }, + { + "epoch": 3.5325365205843293, + "grad_norm": 0.6970235109329224, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 3990 + }, + { + "epoch": 3.5413899955732626, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.0002, + "loss": 1.5198, + "step": 4000 + }, + { + "epoch": 3.5502434705621955, + "grad_norm": 1.0533310174942017, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 4010 + }, + { + "epoch": 3.5590969455511288, + "grad_norm": 0.5444280505180359, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4020 + }, + { + "epoch": 3.567950420540062, + "grad_norm": 0.6007506847381592, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4030 + }, + { + "epoch": 3.576803895528995, + "grad_norm": 0.6088743805885315, + "learning_rate": 0.0002, + "loss": 1.5059, + "step": 4040 + }, + { + "epoch": 3.585657370517928, + "grad_norm": 0.5934239029884338, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 4050 + }, + { + "epoch": 3.5945108455068615, + "grad_norm": 0.605251669883728, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 4060 + }, + { + "epoch": 3.6033643204957944, + "grad_norm": 0.5903469920158386, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 4070 + }, + { + "epoch": 3.6122177954847277, + "grad_norm": 0.6752413511276245, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4080 + }, + { + "epoch": 3.621071270473661, + "grad_norm": 0.5810418725013733, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4090 + }, + { + "epoch": 3.629924745462594, + "grad_norm": 0.5918573141098022, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 4100 + }, + { + "epoch": 3.638778220451527, + "grad_norm": 0.6635358333587646, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 4110 + }, + { + "epoch": 3.6476316954404604, + "grad_norm": 0.5785038471221924, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4120 + }, + { + "epoch": 3.6564851704293937, + "grad_norm": 0.5837879776954651, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4130 + }, + { + "epoch": 3.6653386454183265, + "grad_norm": 0.6449324488639832, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4140 + }, + { + "epoch": 3.67419212040726, + "grad_norm": 0.6191908717155457, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4150 + }, + { + "epoch": 3.683045595396193, + "grad_norm": 0.6937987208366394, + "learning_rate": 0.0002, + "loss": 1.4567, + "step": 4160 + }, + { + "epoch": 3.6918990703851264, + "grad_norm": 0.581128716468811, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 4170 + }, + { + "epoch": 3.7007525453740593, + "grad_norm": 0.6547803282737732, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 4180 + }, + { + "epoch": 3.7096060203629926, + "grad_norm": 0.5961150527000427, + "learning_rate": 0.0002, + "loss": 1.4653, + "step": 4190 + }, + { + "epoch": 3.718459495351926, + "grad_norm": 0.6197913885116577, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 4200 + }, + { + "epoch": 3.7273129703408587, + "grad_norm": 0.688565194606781, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 4210 + }, + { + "epoch": 3.736166445329792, + "grad_norm": 0.5832270979881287, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 4220 + }, + { + "epoch": 3.7450199203187253, + "grad_norm": 0.5643884539604187, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 4230 + }, + { + "epoch": 3.753873395307658, + "grad_norm": 0.6236484050750732, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4240 + }, + { + "epoch": 3.7627268702965915, + "grad_norm": 0.5367720127105713, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4250 + }, + { + "epoch": 3.7715803452855248, + "grad_norm": 0.5785109400749207, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4260 + }, + { + "epoch": 3.7804338202744576, + "grad_norm": 0.5698465704917908, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4270 + }, + { + "epoch": 3.789287295263391, + "grad_norm": 0.5748036503791809, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4280 + }, + { + "epoch": 3.798140770252324, + "grad_norm": 0.608147382736206, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 4290 + }, + { + "epoch": 3.806994245241257, + "grad_norm": 0.5820456147193909, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4300 + }, + { + "epoch": 3.8158477202301904, + "grad_norm": 0.6325612664222717, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4310 + }, + { + "epoch": 3.8247011952191237, + "grad_norm": 0.6465362310409546, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4320 + }, + { + "epoch": 3.8335546702080565, + "grad_norm": 0.5630854368209839, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4330 + }, + { + "epoch": 3.84240814519699, + "grad_norm": 0.6181462407112122, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4340 + }, + { + "epoch": 3.851261620185923, + "grad_norm": 0.6207571029663086, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 4350 + }, + { + "epoch": 3.860115095174856, + "grad_norm": 0.6092919111251831, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4360 + }, + { + "epoch": 3.8689685701637893, + "grad_norm": 0.6140493750572205, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 4370 + }, + { + "epoch": 3.8778220451527226, + "grad_norm": 0.611575722694397, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 4380 + }, + { + "epoch": 3.8866755201416554, + "grad_norm": 0.6288794279098511, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4390 + }, + { + "epoch": 3.8955289951305887, + "grad_norm": 0.6518979072570801, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4400 + }, + { + "epoch": 3.904382470119522, + "grad_norm": 0.6144753098487854, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4410 + }, + { + "epoch": 3.913235945108455, + "grad_norm": 0.7034937143325806, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 4420 + }, + { + "epoch": 3.922089420097388, + "grad_norm": 0.5713187456130981, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 4430 + }, + { + "epoch": 3.9309428950863214, + "grad_norm": 0.6187576651573181, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4440 + }, + { + "epoch": 3.9397963700752543, + "grad_norm": 0.6439383029937744, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 4450 + }, + { + "epoch": 3.9486498450641876, + "grad_norm": 0.6133334636688232, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 4460 + }, + { + "epoch": 3.957503320053121, + "grad_norm": 0.593463659286499, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4470 + }, + { + "epoch": 3.9663567950420537, + "grad_norm": 0.6261998414993286, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4480 + }, + { + "epoch": 3.975210270030987, + "grad_norm": 0.6153767704963684, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4490 + }, + { + "epoch": 3.9840637450199203, + "grad_norm": 0.6184002757072449, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 4500 + }, + { + "epoch": 3.9929172200088536, + "grad_norm": 0.5212734341621399, + "learning_rate": 0.0002, + "loss": 1.5134, + "step": 4510 + }, + { + "epoch": 4.0, + "eval_loss": 1.8745536804199219, + "eval_runtime": 83.0125, + "eval_samples_per_second": 6.204, + "eval_steps_per_second": 0.783, + "step": 4518 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.090829237882716e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-4518/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b71f87cc25f4d90636498eea96644a7ebf519739 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b74d45e92063060d6b620d55080eb3100e930e23efc7021314f9aba7cde8315 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..27aa6a8eb16f63428d12bb537227be4f2229a967 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4481341c634f71a814465727ee1802bf672a7f6c45eea65cee76b6b51c3ff67b +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b918ec32b95edb8e7a0d96fee454f523ef97c08 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4699db14fbf71571777c4c6c2b4389b724cf330f03b562203a56cfbaf5e3d92b +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..10b99d3c960d63359c70caf9eaf811f566968a3f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd6de7ec891b36b0e16381f6d9f29430d99a0933bbf61e8bedda5ba05e3b345 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e47abc0a7b942ce7129f8ac50d35f7b8784cc1ff --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/trainer_state.json @@ -0,0 +1,4021 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 4.999557326250553, + "eval_steps": 10, + "global_step": 5647, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + }, + { + "epoch": 2.0008853474988935, + "grad_norm": 0.34340235590934753, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 2.0097388224878263, + "grad_norm": 0.3710762858390808, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2270 + }, + { + "epoch": 2.0185922974767596, + "grad_norm": 0.35640114545822144, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2280 + }, + { + "epoch": 2.027445772465693, + "grad_norm": 0.45970189571380615, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 2290 + }, + { + "epoch": 2.0362992474546258, + "grad_norm": 0.4256797134876251, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2300 + }, + { + "epoch": 2.045152722443559, + "grad_norm": 0.42421531677246094, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2310 + }, + { + "epoch": 2.0540061974324924, + "grad_norm": 0.4032478928565979, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2320 + }, + { + "epoch": 2.062859672421425, + "grad_norm": 0.4073623716831207, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2330 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.4845200777053833, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2340 + }, + { + "epoch": 2.080566622399292, + "grad_norm": 0.40578293800354004, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2350 + }, + { + "epoch": 2.089420097388225, + "grad_norm": 0.4037284255027771, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2360 + }, + { + "epoch": 2.098273572377158, + "grad_norm": 0.4717613160610199, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2370 + }, + { + "epoch": 2.1071270473660912, + "grad_norm": 0.42076411843299866, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2380 + }, + { + "epoch": 2.1159805223550245, + "grad_norm": 0.47799113392829895, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 2390 + }, + { + "epoch": 2.1248339973439574, + "grad_norm": 0.4253084063529968, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2400 + }, + { + "epoch": 2.1336874723328907, + "grad_norm": 0.5023085474967957, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 2410 + }, + { + "epoch": 2.142540947321824, + "grad_norm": 0.49162712693214417, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2420 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.39035019278526306, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2430 + }, + { + "epoch": 2.16024789729969, + "grad_norm": 0.43223854899406433, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 2440 + }, + { + "epoch": 2.1691013722886234, + "grad_norm": 0.4596616327762604, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 2.1779548472775563, + "grad_norm": 0.4469447731971741, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2460 + }, + { + "epoch": 2.1868083222664896, + "grad_norm": 0.5100595355033875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2470 + }, + { + "epoch": 2.195661797255423, + "grad_norm": 0.4169430732727051, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2480 + }, + { + "epoch": 2.2045152722443557, + "grad_norm": 0.4699254035949707, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 2490 + }, + { + "epoch": 2.213368747233289, + "grad_norm": 0.43524250388145447, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4496648907661438, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2510 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.43408212065696716, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2520 + }, + { + "epoch": 2.2399291722000885, + "grad_norm": 0.4596034288406372, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2530 + }, + { + "epoch": 2.2487826471890218, + "grad_norm": 0.5217021107673645, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2540 + }, + { + "epoch": 2.2576361221779546, + "grad_norm": 0.44745638966560364, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2550 + }, + { + "epoch": 2.266489597166888, + "grad_norm": 0.4484798014163971, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 2560 + }, + { + "epoch": 2.275343072155821, + "grad_norm": 0.4428067207336426, + "learning_rate": 0.0002, + "loss": 1.5321, + "step": 2570 + }, + { + "epoch": 2.2841965471447545, + "grad_norm": 0.5095171332359314, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2580 + }, + { + "epoch": 2.2930500221336874, + "grad_norm": 0.44833096861839294, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2590 + }, + { + "epoch": 2.3019034971226207, + "grad_norm": 0.507905900478363, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 2600 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.40808171033859253, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2610 + }, + { + "epoch": 2.319610447100487, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2620 + }, + { + "epoch": 2.32846392208942, + "grad_norm": 0.44864922761917114, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2630 + }, + { + "epoch": 2.3373173970783534, + "grad_norm": 0.4174162745475769, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2640 + }, + { + "epoch": 2.3461708720672863, + "grad_norm": 0.42314743995666504, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 2650 + }, + { + "epoch": 2.3550243470562195, + "grad_norm": 0.49224185943603516, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2660 + }, + { + "epoch": 2.363877822045153, + "grad_norm": 0.45190292596817017, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2670 + }, + { + "epoch": 2.3727312970340857, + "grad_norm": 0.41817107796669006, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 2680 + }, + { + "epoch": 2.381584772023019, + "grad_norm": 0.6436763405799866, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2690 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.47175949811935425, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2700 + }, + { + "epoch": 2.3992917220008856, + "grad_norm": 0.480339378118515, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 2710 + }, + { + "epoch": 2.4081451969898184, + "grad_norm": 0.4723486006259918, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2720 + }, + { + "epoch": 2.4169986719787517, + "grad_norm": 0.4305492043495178, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 2730 + }, + { + "epoch": 2.425852146967685, + "grad_norm": 0.5007492303848267, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2740 + }, + { + "epoch": 2.434705621956618, + "grad_norm": 0.5374062061309814, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 2750 + }, + { + "epoch": 2.443559096945551, + "grad_norm": 0.45866212248802185, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2760 + }, + { + "epoch": 2.4524125719344845, + "grad_norm": 0.47914502024650574, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 2770 + }, + { + "epoch": 2.4612660469234173, + "grad_norm": 0.43804746866226196, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 2780 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.43656906485557556, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2790 + }, + { + "epoch": 2.478972996901284, + "grad_norm": 0.4820363521575928, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 2800 + }, + { + "epoch": 2.4878264718902168, + "grad_norm": 0.4916800558567047, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2810 + }, + { + "epoch": 2.49667994687915, + "grad_norm": 0.4521256983280182, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2820 + }, + { + "epoch": 2.5055334218680834, + "grad_norm": 0.5066806674003601, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2830 + }, + { + "epoch": 2.514386896857016, + "grad_norm": 0.4768151640892029, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2840 + }, + { + "epoch": 2.5232403718459495, + "grad_norm": 0.5144683718681335, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 2850 + }, + { + "epoch": 2.532093846834883, + "grad_norm": 0.4718942940235138, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2860 + }, + { + "epoch": 2.5409473218238157, + "grad_norm": 0.4924587309360504, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2870 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.4649953842163086, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2880 + }, + { + "epoch": 2.5586542718016823, + "grad_norm": 0.4836665987968445, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2890 + }, + { + "epoch": 2.567507746790615, + "grad_norm": 0.4162124991416931, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 2900 + }, + { + "epoch": 2.5763612217795484, + "grad_norm": 0.4894537925720215, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2910 + }, + { + "epoch": 2.5852146967684817, + "grad_norm": 0.4539397358894348, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 2.5940681717574146, + "grad_norm": 0.4718773066997528, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 2930 + }, + { + "epoch": 2.602921646746348, + "grad_norm": 0.49989837408065796, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2940 + }, + { + "epoch": 2.611775121735281, + "grad_norm": 0.4862406849861145, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2950 + }, + { + "epoch": 2.620628596724214, + "grad_norm": 0.4244804382324219, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2960 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.49304354190826416, + "learning_rate": 0.0002, + "loss": 1.7795, + "step": 2970 + }, + { + "epoch": 2.6383355467020806, + "grad_norm": 0.4818236529827118, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 2980 + }, + { + "epoch": 2.647189021691014, + "grad_norm": 0.5077425837516785, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 2990 + }, + { + "epoch": 2.6560424966799467, + "grad_norm": 0.4494157135486603, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 3000 + }, + { + "epoch": 2.66489597166888, + "grad_norm": 0.4790278971195221, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 3010 + }, + { + "epoch": 2.6737494466578133, + "grad_norm": 0.4702624976634979, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3020 + }, + { + "epoch": 2.682602921646746, + "grad_norm": 0.5082133412361145, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3030 + }, + { + "epoch": 2.6914563966356795, + "grad_norm": 0.4553256630897522, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 3040 + }, + { + "epoch": 2.700309871624613, + "grad_norm": 0.4492715001106262, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3050 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.4555944502353668, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 3060 + }, + { + "epoch": 2.718016821602479, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3070 + }, + { + "epoch": 2.7268702965914122, + "grad_norm": 0.4628562927246094, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3080 + }, + { + "epoch": 2.7357237715803455, + "grad_norm": 0.5169575810432434, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 3090 + }, + { + "epoch": 2.7445772465692784, + "grad_norm": 0.4630090892314911, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 3100 + }, + { + "epoch": 2.7534307215582117, + "grad_norm": 0.5437219738960266, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3110 + }, + { + "epoch": 2.762284196547145, + "grad_norm": 0.5102152228355408, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 3120 + }, + { + "epoch": 2.771137671536078, + "grad_norm": 0.48287826776504517, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3130 + }, + { + "epoch": 2.779991146525011, + "grad_norm": 0.4671737253665924, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 3140 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 0.5177035331726074, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3150 + }, + { + "epoch": 2.7976980965028773, + "grad_norm": 0.450989305973053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3160 + }, + { + "epoch": 2.8065515714918106, + "grad_norm": 0.45007848739624023, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3170 + }, + { + "epoch": 2.815405046480744, + "grad_norm": 0.4600294530391693, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3180 + }, + { + "epoch": 2.8242585214696767, + "grad_norm": 0.485628604888916, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 3190 + }, + { + "epoch": 2.83311199645861, + "grad_norm": 0.49811574816703796, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 3200 + }, + { + "epoch": 2.8419654714475433, + "grad_norm": 0.5012516975402832, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 3210 + }, + { + "epoch": 2.850818946436476, + "grad_norm": 0.4552757740020752, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3220 + }, + { + "epoch": 2.8596724214254094, + "grad_norm": 0.4539635479450226, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3230 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3240 + }, + { + "epoch": 2.8773793714032756, + "grad_norm": 0.4570811688899994, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3250 + }, + { + "epoch": 2.886232846392209, + "grad_norm": 0.48181653022766113, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3260 + }, + { + "epoch": 2.895086321381142, + "grad_norm": 0.4871032238006592, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 3270 + }, + { + "epoch": 2.903939796370075, + "grad_norm": 0.4643239676952362, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3280 + }, + { + "epoch": 2.9127932713590083, + "grad_norm": 0.5024484395980835, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3290 + }, + { + "epoch": 2.9216467463479416, + "grad_norm": 0.4425384998321533, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3300 + }, + { + "epoch": 2.9305002213368745, + "grad_norm": 0.459168016910553, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 3310 + }, + { + "epoch": 2.939353696325808, + "grad_norm": 0.4950717091560364, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 3320 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.4516230523586273, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3330 + }, + { + "epoch": 2.957060646303674, + "grad_norm": 0.49523285031318665, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3340 + }, + { + "epoch": 2.9659141212926072, + "grad_norm": 0.49282631278038025, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 3350 + }, + { + "epoch": 2.9747675962815405, + "grad_norm": 0.45825016498565674, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 3360 + }, + { + "epoch": 2.983621071270474, + "grad_norm": 0.4952891170978546, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 3370 + }, + { + "epoch": 2.9924745462594067, + "grad_norm": 0.42182639241218567, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3380 + }, + { + "epoch": 2.9995573262505535, + "eval_loss": 1.8308420181274414, + "eval_runtime": 82.786, + "eval_samples_per_second": 6.221, + "eval_steps_per_second": 0.785, + "step": 3388 + }, + { + "epoch": 3.00132802124834, + "grad_norm": 0.47721418738365173, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 3390 + }, + { + "epoch": 3.0101814962372733, + "grad_norm": 0.5284923911094666, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 3400 + }, + { + "epoch": 3.019034971226206, + "grad_norm": 0.5607061982154846, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 3410 + }, + { + "epoch": 3.0278884462151394, + "grad_norm": 0.5271363258361816, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 3420 + }, + { + "epoch": 3.0367419212040727, + "grad_norm": 0.48660898208618164, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 3430 + }, + { + "epoch": 3.0455953961930056, + "grad_norm": 0.5767933130264282, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 3440 + }, + { + "epoch": 3.054448871181939, + "grad_norm": 0.5591282248497009, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3450 + }, + { + "epoch": 3.063302346170872, + "grad_norm": 0.5870814323425293, + "learning_rate": 0.0002, + "loss": 1.5112, + "step": 3460 + }, + { + "epoch": 3.072155821159805, + "grad_norm": 0.4861546456813812, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 3470 + }, + { + "epoch": 3.0810092961487383, + "grad_norm": 0.5238925814628601, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 3480 + }, + { + "epoch": 3.0898627711376716, + "grad_norm": 0.5521751046180725, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 3490 + }, + { + "epoch": 3.098716246126605, + "grad_norm": 0.5816575884819031, + "learning_rate": 0.0002, + "loss": 1.4454, + "step": 3500 + }, + { + "epoch": 3.1075697211155378, + "grad_norm": 0.5281513333320618, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 3510 + }, + { + "epoch": 3.116423196104471, + "grad_norm": 0.5847303867340088, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 3520 + }, + { + "epoch": 3.1252766710934043, + "grad_norm": 0.5683517456054688, + "learning_rate": 0.0002, + "loss": 1.5513, + "step": 3530 + }, + { + "epoch": 3.134130146082337, + "grad_norm": 0.5177015662193298, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 3540 + }, + { + "epoch": 3.1429836210712705, + "grad_norm": 0.5922423601150513, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 3550 + }, + { + "epoch": 3.151837096060204, + "grad_norm": 0.7018587589263916, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 3560 + }, + { + "epoch": 3.1606905710491366, + "grad_norm": 0.6152004599571228, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 3570 + }, + { + "epoch": 3.16954404603807, + "grad_norm": 0.5350717902183533, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 3580 + }, + { + "epoch": 3.1783975210270032, + "grad_norm": 0.5971009731292725, + "learning_rate": 0.0002, + "loss": 1.4739, + "step": 3590 + }, + { + "epoch": 3.187250996015936, + "grad_norm": 0.7312001585960388, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3600 + }, + { + "epoch": 3.1961044710048694, + "grad_norm": 0.6372535228729248, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3610 + }, + { + "epoch": 3.2049579459938027, + "grad_norm": 0.6098020672798157, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 3620 + }, + { + "epoch": 3.2138114209827355, + "grad_norm": 0.5506435632705688, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3630 + }, + { + "epoch": 3.222664895971669, + "grad_norm": 0.6043022274971008, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3640 + }, + { + "epoch": 3.231518370960602, + "grad_norm": 0.5495519042015076, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3650 + }, + { + "epoch": 3.240371845949535, + "grad_norm": 0.5769572257995605, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3660 + }, + { + "epoch": 3.2492253209384683, + "grad_norm": 0.6833786964416504, + "learning_rate": 0.0002, + "loss": 1.4604, + "step": 3670 + }, + { + "epoch": 3.2580787959274016, + "grad_norm": 0.6962856650352478, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3680 + }, + { + "epoch": 3.2669322709163344, + "grad_norm": 0.6553098559379578, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 3690 + }, + { + "epoch": 3.2757857459052677, + "grad_norm": 0.5907557010650635, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 3700 + }, + { + "epoch": 3.284639220894201, + "grad_norm": 0.5712862014770508, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 3710 + }, + { + "epoch": 3.2934926958831343, + "grad_norm": 0.573820948600769, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 3720 + }, + { + "epoch": 3.302346170872067, + "grad_norm": 0.6650304198265076, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 3730 + }, + { + "epoch": 3.3111996458610005, + "grad_norm": 0.5182583928108215, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 3740 + }, + { + "epoch": 3.3200531208499338, + "grad_norm": 0.5078902840614319, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 3750 + }, + { + "epoch": 3.3289065958388666, + "grad_norm": 0.7062374353408813, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 3760 + }, + { + "epoch": 3.3377600708278, + "grad_norm": 0.5711262822151184, + "learning_rate": 0.0002, + "loss": 1.5017, + "step": 3770 + }, + { + "epoch": 3.346613545816733, + "grad_norm": 0.5624606013298035, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 3780 + }, + { + "epoch": 3.355467020805666, + "grad_norm": 0.6008231043815613, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 3790 + }, + { + "epoch": 3.3643204957945994, + "grad_norm": 0.6120018362998962, + "learning_rate": 0.0002, + "loss": 1.5038, + "step": 3800 + }, + { + "epoch": 3.3731739707835326, + "grad_norm": 0.5679979920387268, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3810 + }, + { + "epoch": 3.3820274457724655, + "grad_norm": 0.5613794922828674, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3820 + }, + { + "epoch": 3.390880920761399, + "grad_norm": 0.5328839421272278, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 3830 + }, + { + "epoch": 3.399734395750332, + "grad_norm": 0.5960017442703247, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 3840 + }, + { + "epoch": 3.4085878707392654, + "grad_norm": 0.5264106392860413, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 3850 + }, + { + "epoch": 3.4174413457281982, + "grad_norm": 0.6378359198570251, + "learning_rate": 0.0002, + "loss": 1.4766, + "step": 3860 + }, + { + "epoch": 3.4262948207171315, + "grad_norm": 0.5792967677116394, + "learning_rate": 0.0002, + "loss": 1.4898, + "step": 3870 + }, + { + "epoch": 3.435148295706065, + "grad_norm": 0.6836280822753906, + "learning_rate": 0.0002, + "loss": 1.4914, + "step": 3880 + }, + { + "epoch": 3.4440017706949977, + "grad_norm": 0.6073971390724182, + "learning_rate": 0.0002, + "loss": 1.5002, + "step": 3890 + }, + { + "epoch": 3.452855245683931, + "grad_norm": 0.5753195881843567, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 3900 + }, + { + "epoch": 3.4617087206728643, + "grad_norm": 0.6007646918296814, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3910 + }, + { + "epoch": 3.470562195661797, + "grad_norm": 0.6025636196136475, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3920 + }, + { + "epoch": 3.4794156706507304, + "grad_norm": 0.6819562315940857, + "learning_rate": 0.0002, + "loss": 1.4612, + "step": 3930 + }, + { + "epoch": 3.4882691456396637, + "grad_norm": 0.6448395848274231, + "learning_rate": 0.0002, + "loss": 1.518, + "step": 3940 + }, + { + "epoch": 3.4971226206285966, + "grad_norm": 0.5712178945541382, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 3950 + }, + { + "epoch": 3.50597609561753, + "grad_norm": 0.6300532817840576, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 3960 + }, + { + "epoch": 3.514829570606463, + "grad_norm": 0.6120840907096863, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 3970 + }, + { + "epoch": 3.523683045595396, + "grad_norm": 0.6887575387954712, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 3980 + }, + { + "epoch": 3.5325365205843293, + "grad_norm": 0.6970235109329224, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 3990 + }, + { + "epoch": 3.5413899955732626, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.0002, + "loss": 1.5198, + "step": 4000 + }, + { + "epoch": 3.5502434705621955, + "grad_norm": 1.0533310174942017, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 4010 + }, + { + "epoch": 3.5590969455511288, + "grad_norm": 0.5444280505180359, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4020 + }, + { + "epoch": 3.567950420540062, + "grad_norm": 0.6007506847381592, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4030 + }, + { + "epoch": 3.576803895528995, + "grad_norm": 0.6088743805885315, + "learning_rate": 0.0002, + "loss": 1.5059, + "step": 4040 + }, + { + "epoch": 3.585657370517928, + "grad_norm": 0.5934239029884338, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 4050 + }, + { + "epoch": 3.5945108455068615, + "grad_norm": 0.605251669883728, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 4060 + }, + { + "epoch": 3.6033643204957944, + "grad_norm": 0.5903469920158386, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 4070 + }, + { + "epoch": 3.6122177954847277, + "grad_norm": 0.6752413511276245, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4080 + }, + { + "epoch": 3.621071270473661, + "grad_norm": 0.5810418725013733, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4090 + }, + { + "epoch": 3.629924745462594, + "grad_norm": 0.5918573141098022, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 4100 + }, + { + "epoch": 3.638778220451527, + "grad_norm": 0.6635358333587646, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 4110 + }, + { + "epoch": 3.6476316954404604, + "grad_norm": 0.5785038471221924, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4120 + }, + { + "epoch": 3.6564851704293937, + "grad_norm": 0.5837879776954651, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4130 + }, + { + "epoch": 3.6653386454183265, + "grad_norm": 0.6449324488639832, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4140 + }, + { + "epoch": 3.67419212040726, + "grad_norm": 0.6191908717155457, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4150 + }, + { + "epoch": 3.683045595396193, + "grad_norm": 0.6937987208366394, + "learning_rate": 0.0002, + "loss": 1.4567, + "step": 4160 + }, + { + "epoch": 3.6918990703851264, + "grad_norm": 0.581128716468811, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 4170 + }, + { + "epoch": 3.7007525453740593, + "grad_norm": 0.6547803282737732, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 4180 + }, + { + "epoch": 3.7096060203629926, + "grad_norm": 0.5961150527000427, + "learning_rate": 0.0002, + "loss": 1.4653, + "step": 4190 + }, + { + "epoch": 3.718459495351926, + "grad_norm": 0.6197913885116577, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 4200 + }, + { + "epoch": 3.7273129703408587, + "grad_norm": 0.688565194606781, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 4210 + }, + { + "epoch": 3.736166445329792, + "grad_norm": 0.5832270979881287, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 4220 + }, + { + "epoch": 3.7450199203187253, + "grad_norm": 0.5643884539604187, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 4230 + }, + { + "epoch": 3.753873395307658, + "grad_norm": 0.6236484050750732, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4240 + }, + { + "epoch": 3.7627268702965915, + "grad_norm": 0.5367720127105713, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4250 + }, + { + "epoch": 3.7715803452855248, + "grad_norm": 0.5785109400749207, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4260 + }, + { + "epoch": 3.7804338202744576, + "grad_norm": 0.5698465704917908, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4270 + }, + { + "epoch": 3.789287295263391, + "grad_norm": 0.5748036503791809, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4280 + }, + { + "epoch": 3.798140770252324, + "grad_norm": 0.608147382736206, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 4290 + }, + { + "epoch": 3.806994245241257, + "grad_norm": 0.5820456147193909, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4300 + }, + { + "epoch": 3.8158477202301904, + "grad_norm": 0.6325612664222717, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4310 + }, + { + "epoch": 3.8247011952191237, + "grad_norm": 0.6465362310409546, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4320 + }, + { + "epoch": 3.8335546702080565, + "grad_norm": 0.5630854368209839, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4330 + }, + { + "epoch": 3.84240814519699, + "grad_norm": 0.6181462407112122, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4340 + }, + { + "epoch": 3.851261620185923, + "grad_norm": 0.6207571029663086, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 4350 + }, + { + "epoch": 3.860115095174856, + "grad_norm": 0.6092919111251831, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4360 + }, + { + "epoch": 3.8689685701637893, + "grad_norm": 0.6140493750572205, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 4370 + }, + { + "epoch": 3.8778220451527226, + "grad_norm": 0.611575722694397, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 4380 + }, + { + "epoch": 3.8866755201416554, + "grad_norm": 0.6288794279098511, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4390 + }, + { + "epoch": 3.8955289951305887, + "grad_norm": 0.6518979072570801, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4400 + }, + { + "epoch": 3.904382470119522, + "grad_norm": 0.6144753098487854, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4410 + }, + { + "epoch": 3.913235945108455, + "grad_norm": 0.7034937143325806, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 4420 + }, + { + "epoch": 3.922089420097388, + "grad_norm": 0.5713187456130981, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 4430 + }, + { + "epoch": 3.9309428950863214, + "grad_norm": 0.6187576651573181, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4440 + }, + { + "epoch": 3.9397963700752543, + "grad_norm": 0.6439383029937744, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 4450 + }, + { + "epoch": 3.9486498450641876, + "grad_norm": 0.6133334636688232, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 4460 + }, + { + "epoch": 3.957503320053121, + "grad_norm": 0.593463659286499, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4470 + }, + { + "epoch": 3.9663567950420537, + "grad_norm": 0.6261998414993286, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4480 + }, + { + "epoch": 3.975210270030987, + "grad_norm": 0.6153767704963684, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4490 + }, + { + "epoch": 3.9840637450199203, + "grad_norm": 0.6184002757072449, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 4500 + }, + { + "epoch": 3.9929172200088536, + "grad_norm": 0.5212734341621399, + "learning_rate": 0.0002, + "loss": 1.5134, + "step": 4510 + }, + { + "epoch": 4.0, + "eval_loss": 1.8745536804199219, + "eval_runtime": 83.0125, + "eval_samples_per_second": 6.204, + "eval_steps_per_second": 0.783, + "step": 4518 + }, + { + "epoch": 4.001770694997787, + "grad_norm": 0.5871603488922119, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4520 + }, + { + "epoch": 4.01062416998672, + "grad_norm": 0.6746091842651367, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 4530 + }, + { + "epoch": 4.019477644975653, + "grad_norm": 0.6159639358520508, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 4540 + }, + { + "epoch": 4.028331119964586, + "grad_norm": 0.7529398202896118, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 4550 + }, + { + "epoch": 4.037184594953519, + "grad_norm": 0.788398027420044, + "learning_rate": 0.0002, + "loss": 1.3202, + "step": 4560 + }, + { + "epoch": 4.046038069942452, + "grad_norm": 0.9679850935935974, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 4570 + }, + { + "epoch": 4.054891544931386, + "grad_norm": 0.6305310130119324, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4580 + }, + { + "epoch": 4.063745019920319, + "grad_norm": 0.8557451963424683, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 4590 + }, + { + "epoch": 4.0725984949092515, + "grad_norm": 0.741518497467041, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4600 + }, + { + "epoch": 4.081451969898185, + "grad_norm": 0.6573862433433533, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4610 + }, + { + "epoch": 4.090305444887118, + "grad_norm": 0.6926319599151611, + "learning_rate": 0.0002, + "loss": 1.3341, + "step": 4620 + }, + { + "epoch": 4.099158919876051, + "grad_norm": 0.9212626218795776, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4630 + }, + { + "epoch": 4.108012394864985, + "grad_norm": 0.7167867422103882, + "learning_rate": 0.0002, + "loss": 1.3402, + "step": 4640 + }, + { + "epoch": 4.116865869853918, + "grad_norm": 0.6691595911979675, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4650 + }, + { + "epoch": 4.12571934484285, + "grad_norm": 0.8708247542381287, + "learning_rate": 0.0002, + "loss": 1.247, + "step": 4660 + }, + { + "epoch": 4.134572819831784, + "grad_norm": 0.8612170219421387, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 4670 + }, + { + "epoch": 4.143426294820717, + "grad_norm": 0.7688325047492981, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4680 + }, + { + "epoch": 4.152279769809651, + "grad_norm": 0.7606917023658752, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4690 + }, + { + "epoch": 4.161133244798584, + "grad_norm": 0.8241282105445862, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 4700 + }, + { + "epoch": 4.1699867197875164, + "grad_norm": 0.7480464577674866, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 4710 + }, + { + "epoch": 4.17884019477645, + "grad_norm": 0.7092460989952087, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4720 + }, + { + "epoch": 4.187693669765383, + "grad_norm": 0.8782108426094055, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4730 + }, + { + "epoch": 4.196547144754316, + "grad_norm": 0.6875300407409668, + "learning_rate": 0.0002, + "loss": 1.3626, + "step": 4740 + }, + { + "epoch": 4.20540061974325, + "grad_norm": 0.7713887691497803, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 4750 + }, + { + "epoch": 4.2142540947321825, + "grad_norm": 0.8270819783210754, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 4760 + }, + { + "epoch": 4.223107569721115, + "grad_norm": 0.7109288573265076, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 4770 + }, + { + "epoch": 4.231961044710049, + "grad_norm": 0.7209359407424927, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 4780 + }, + { + "epoch": 4.240814519698982, + "grad_norm": 0.7142833471298218, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 4790 + }, + { + "epoch": 4.249667994687915, + "grad_norm": 0.8526809811592102, + "learning_rate": 0.0002, + "loss": 1.3654, + "step": 4800 + }, + { + "epoch": 4.2585214696768485, + "grad_norm": 0.7064695954322815, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 4810 + }, + { + "epoch": 4.267374944665781, + "grad_norm": 0.7646124362945557, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4820 + }, + { + "epoch": 4.276228419654714, + "grad_norm": 0.7377115488052368, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 4830 + }, + { + "epoch": 4.285081894643648, + "grad_norm": 0.7308453321456909, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 4840 + }, + { + "epoch": 4.293935369632581, + "grad_norm": 0.6687684059143066, + "learning_rate": 0.0002, + "loss": 1.3653, + "step": 4850 + }, + { + "epoch": 4.302788844621514, + "grad_norm": 0.7447634339332581, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 4860 + }, + { + "epoch": 4.311642319610447, + "grad_norm": 0.7661601305007935, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 4870 + }, + { + "epoch": 4.32049579459938, + "grad_norm": 0.7492215037345886, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4880 + }, + { + "epoch": 4.329349269588313, + "grad_norm": 0.9554458856582642, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 4890 + }, + { + "epoch": 4.338202744577247, + "grad_norm": 0.7409822940826416, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 4900 + }, + { + "epoch": 4.34705621956618, + "grad_norm": 0.9848645329475403, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 4910 + }, + { + "epoch": 4.355909694555113, + "grad_norm": 0.803995668888092, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4920 + }, + { + "epoch": 4.364763169544046, + "grad_norm": 0.7480606436729431, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 4930 + }, + { + "epoch": 4.373616644532979, + "grad_norm": 0.7018141150474548, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 4940 + }, + { + "epoch": 4.382470119521912, + "grad_norm": 0.7684932351112366, + "learning_rate": 0.0002, + "loss": 1.4034, + "step": 4950 + }, + { + "epoch": 4.391323594510846, + "grad_norm": 0.7849185466766357, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 4960 + }, + { + "epoch": 4.400177069499779, + "grad_norm": 0.7858862280845642, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 4970 + }, + { + "epoch": 4.4090305444887115, + "grad_norm": 0.8270778059959412, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4980 + }, + { + "epoch": 4.417884019477645, + "grad_norm": 0.8464101552963257, + "learning_rate": 0.0002, + "loss": 1.445, + "step": 4990 + }, + { + "epoch": 4.426737494466578, + "grad_norm": 0.85670405626297, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5000 + }, + { + "epoch": 4.435590969455511, + "grad_norm": 0.8656655550003052, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5010 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7605292201042175, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 5020 + }, + { + "epoch": 4.4532979194333775, + "grad_norm": 0.7682471871376038, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 5030 + }, + { + "epoch": 4.46215139442231, + "grad_norm": 0.7209102511405945, + "learning_rate": 0.0002, + "loss": 1.3432, + "step": 5040 + }, + { + "epoch": 4.471004869411244, + "grad_norm": 0.8259989023208618, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 5050 + }, + { + "epoch": 4.479858344400177, + "grad_norm": 0.7342197895050049, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 5060 + }, + { + "epoch": 4.48871181938911, + "grad_norm": 0.7869040369987488, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 5070 + }, + { + "epoch": 4.4975652943780435, + "grad_norm": 0.7906143665313721, + "learning_rate": 0.0002, + "loss": 1.3734, + "step": 5080 + }, + { + "epoch": 4.506418769366976, + "grad_norm": 0.7336861491203308, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 5090 + }, + { + "epoch": 4.515272244355909, + "grad_norm": 0.8264166712760925, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5100 + }, + { + "epoch": 4.524125719344843, + "grad_norm": 0.8144693970680237, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 5110 + }, + { + "epoch": 4.532979194333776, + "grad_norm": 0.8257269263267517, + "learning_rate": 0.0002, + "loss": 1.3044, + "step": 5120 + }, + { + "epoch": 4.541832669322709, + "grad_norm": 0.8838174343109131, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 5130 + }, + { + "epoch": 4.550686144311642, + "grad_norm": 0.7081145644187927, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 5140 + }, + { + "epoch": 4.559539619300575, + "grad_norm": 0.7137823700904846, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 5150 + }, + { + "epoch": 4.568393094289509, + "grad_norm": 0.7890386581420898, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 5160 + }, + { + "epoch": 4.577246569278442, + "grad_norm": 0.6418015360832214, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 5170 + }, + { + "epoch": 4.586100044267375, + "grad_norm": 0.768373966217041, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 5180 + }, + { + "epoch": 4.5949535192563085, + "grad_norm": 0.6934067606925964, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5190 + }, + { + "epoch": 4.603806994245241, + "grad_norm": 0.9430719017982483, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5200 + }, + { + "epoch": 4.612660469234174, + "grad_norm": 0.880264163017273, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 5210 + }, + { + "epoch": 4.621513944223108, + "grad_norm": 0.7584623098373413, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 5220 + }, + { + "epoch": 4.630367419212041, + "grad_norm": 0.7974506616592407, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5230 + }, + { + "epoch": 4.639220894200974, + "grad_norm": 0.8812133073806763, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 5240 + }, + { + "epoch": 4.648074369189907, + "grad_norm": 0.8968724012374878, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 5250 + }, + { + "epoch": 4.65692784417884, + "grad_norm": 0.7317764759063721, + "learning_rate": 0.0002, + "loss": 1.3327, + "step": 5260 + }, + { + "epoch": 4.665781319167773, + "grad_norm": 0.7415484189987183, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 5270 + }, + { + "epoch": 4.674634794156707, + "grad_norm": 0.7867009043693542, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 5280 + }, + { + "epoch": 4.68348826914564, + "grad_norm": 0.6895416378974915, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 5290 + }, + { + "epoch": 4.6923417441345725, + "grad_norm": 0.7324506640434265, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 5300 + }, + { + "epoch": 4.701195219123506, + "grad_norm": 0.7383193969726562, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 5310 + }, + { + "epoch": 4.710048694112439, + "grad_norm": 0.8254916071891785, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 5320 + }, + { + "epoch": 4.718902169101372, + "grad_norm": 0.8161033987998962, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 5330 + }, + { + "epoch": 4.727755644090306, + "grad_norm": 0.7664386034011841, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 5340 + }, + { + "epoch": 4.7366091190792385, + "grad_norm": 0.7465475797653198, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 5350 + }, + { + "epoch": 4.745462594068171, + "grad_norm": 0.7810078263282776, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 5360 + }, + { + "epoch": 4.754316069057105, + "grad_norm": 0.7428439855575562, + "learning_rate": 0.0002, + "loss": 1.4489, + "step": 5370 + }, + { + "epoch": 4.763169544046038, + "grad_norm": 0.9548320174217224, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 5380 + }, + { + "epoch": 4.772023019034972, + "grad_norm": 0.7959533333778381, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 5390 + }, + { + "epoch": 4.780876494023905, + "grad_norm": 0.747473418712616, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 5400 + }, + { + "epoch": 4.789729969012837, + "grad_norm": 0.7863122820854187, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5410 + }, + { + "epoch": 4.798583444001771, + "grad_norm": 0.7769626379013062, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 5420 + }, + { + "epoch": 4.807436918990704, + "grad_norm": 0.8551191091537476, + "learning_rate": 0.0002, + "loss": 1.4484, + "step": 5430 + }, + { + "epoch": 4.816290393979637, + "grad_norm": 0.8364850878715515, + "learning_rate": 0.0002, + "loss": 1.4314, + "step": 5440 + }, + { + "epoch": 4.825143868968571, + "grad_norm": 0.7458856701850891, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 5450 + }, + { + "epoch": 4.8339973439575035, + "grad_norm": 0.7558291554450989, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 5460 + }, + { + "epoch": 4.842850818946436, + "grad_norm": 0.8396534323692322, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 5470 + }, + { + "epoch": 4.85170429393537, + "grad_norm": 0.7790794968605042, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 5480 + }, + { + "epoch": 4.860557768924303, + "grad_norm": 0.8607641458511353, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5490 + }, + { + "epoch": 4.869411243913236, + "grad_norm": 0.828134298324585, + "learning_rate": 0.0002, + "loss": 1.4011, + "step": 5500 + }, + { + "epoch": 4.8782647189021695, + "grad_norm": 0.8783106803894043, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 5510 + }, + { + "epoch": 4.887118193891102, + "grad_norm": 0.7476183176040649, + "learning_rate": 0.0002, + "loss": 1.4565, + "step": 5520 + }, + { + "epoch": 4.895971668880035, + "grad_norm": 0.8023254871368408, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 5530 + }, + { + "epoch": 4.904825143868969, + "grad_norm": 0.8021706938743591, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 5540 + }, + { + "epoch": 4.913678618857902, + "grad_norm": 0.7873618602752686, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 5550 + }, + { + "epoch": 4.922532093846835, + "grad_norm": 0.7181428670883179, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5560 + }, + { + "epoch": 4.931385568835768, + "grad_norm": 0.7464273571968079, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 5570 + }, + { + "epoch": 4.940239043824701, + "grad_norm": 0.7433671355247498, + "learning_rate": 0.0002, + "loss": 1.3184, + "step": 5580 + }, + { + "epoch": 4.949092518813634, + "grad_norm": 0.7571114301681519, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5590 + }, + { + "epoch": 4.957945993802568, + "grad_norm": 0.7811630964279175, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5600 + }, + { + "epoch": 4.966799468791501, + "grad_norm": 0.7609148621559143, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 5610 + }, + { + "epoch": 4.9756529437804335, + "grad_norm": 0.7324382066726685, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 5620 + }, + { + "epoch": 4.984506418769367, + "grad_norm": 0.9249559640884399, + "learning_rate": 0.0002, + "loss": 1.4557, + "step": 5630 + }, + { + "epoch": 4.9933598937583, + "grad_norm": 0.7852522134780884, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 5640 + }, + { + "epoch": 4.999557326250553, + "eval_loss": 1.9384633302688599, + "eval_runtime": 82.6042, + "eval_samples_per_second": 6.235, + "eval_steps_per_second": 0.787, + "step": 5647 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6135365473533952e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-5647/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..43e886039d94d27b5bba48ebccc67e1197501f73 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807a88a62618c3ab54166c8738c19d305727de63f2e8cec4054d95ae71053fa0 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a458d9edce1fcbd175c4fab656371a436ef0245 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:593a05ec3fc1889dd603c3b9443198188c5e9c39fa58d20411f8ca3e7b7ef30c +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..739e83d9263effb6d3c9c641f122c188c4030807 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0abe104fcd93d1ebbbcfe9fb0c7aef3e2b911c3e46e57240ac9c6723353f41c1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc175c9ed6abade667078c86d7a00ab44c3f6763 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec213c64f7341b20bae9c8bcc6128b96e389ddb85222d52f2e73969feaa4fcf9 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a72cdc3af729e32fc301b56235b6e2f9c33c5e2e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/trainer_state.json @@ -0,0 +1,4820 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 6777, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + }, + { + "epoch": 2.0008853474988935, + "grad_norm": 0.34340235590934753, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 2.0097388224878263, + "grad_norm": 0.3710762858390808, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2270 + }, + { + "epoch": 2.0185922974767596, + "grad_norm": 0.35640114545822144, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2280 + }, + { + "epoch": 2.027445772465693, + "grad_norm": 0.45970189571380615, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 2290 + }, + { + "epoch": 2.0362992474546258, + "grad_norm": 0.4256797134876251, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2300 + }, + { + "epoch": 2.045152722443559, + "grad_norm": 0.42421531677246094, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2310 + }, + { + "epoch": 2.0540061974324924, + "grad_norm": 0.4032478928565979, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2320 + }, + { + "epoch": 2.062859672421425, + "grad_norm": 0.4073623716831207, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2330 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.4845200777053833, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2340 + }, + { + "epoch": 2.080566622399292, + "grad_norm": 0.40578293800354004, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2350 + }, + { + "epoch": 2.089420097388225, + "grad_norm": 0.4037284255027771, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2360 + }, + { + "epoch": 2.098273572377158, + "grad_norm": 0.4717613160610199, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2370 + }, + { + "epoch": 2.1071270473660912, + "grad_norm": 0.42076411843299866, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2380 + }, + { + "epoch": 2.1159805223550245, + "grad_norm": 0.47799113392829895, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 2390 + }, + { + "epoch": 2.1248339973439574, + "grad_norm": 0.4253084063529968, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2400 + }, + { + "epoch": 2.1336874723328907, + "grad_norm": 0.5023085474967957, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 2410 + }, + { + "epoch": 2.142540947321824, + "grad_norm": 0.49162712693214417, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2420 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.39035019278526306, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2430 + }, + { + "epoch": 2.16024789729969, + "grad_norm": 0.43223854899406433, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 2440 + }, + { + "epoch": 2.1691013722886234, + "grad_norm": 0.4596616327762604, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 2.1779548472775563, + "grad_norm": 0.4469447731971741, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2460 + }, + { + "epoch": 2.1868083222664896, + "grad_norm": 0.5100595355033875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2470 + }, + { + "epoch": 2.195661797255423, + "grad_norm": 0.4169430732727051, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2480 + }, + { + "epoch": 2.2045152722443557, + "grad_norm": 0.4699254035949707, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 2490 + }, + { + "epoch": 2.213368747233289, + "grad_norm": 0.43524250388145447, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4496648907661438, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2510 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.43408212065696716, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2520 + }, + { + "epoch": 2.2399291722000885, + "grad_norm": 0.4596034288406372, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2530 + }, + { + "epoch": 2.2487826471890218, + "grad_norm": 0.5217021107673645, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2540 + }, + { + "epoch": 2.2576361221779546, + "grad_norm": 0.44745638966560364, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2550 + }, + { + "epoch": 2.266489597166888, + "grad_norm": 0.4484798014163971, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 2560 + }, + { + "epoch": 2.275343072155821, + "grad_norm": 0.4428067207336426, + "learning_rate": 0.0002, + "loss": 1.5321, + "step": 2570 + }, + { + "epoch": 2.2841965471447545, + "grad_norm": 0.5095171332359314, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2580 + }, + { + "epoch": 2.2930500221336874, + "grad_norm": 0.44833096861839294, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2590 + }, + { + "epoch": 2.3019034971226207, + "grad_norm": 0.507905900478363, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 2600 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.40808171033859253, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2610 + }, + { + "epoch": 2.319610447100487, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2620 + }, + { + "epoch": 2.32846392208942, + "grad_norm": 0.44864922761917114, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2630 + }, + { + "epoch": 2.3373173970783534, + "grad_norm": 0.4174162745475769, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2640 + }, + { + "epoch": 2.3461708720672863, + "grad_norm": 0.42314743995666504, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 2650 + }, + { + "epoch": 2.3550243470562195, + "grad_norm": 0.49224185943603516, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2660 + }, + { + "epoch": 2.363877822045153, + "grad_norm": 0.45190292596817017, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2670 + }, + { + "epoch": 2.3727312970340857, + "grad_norm": 0.41817107796669006, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 2680 + }, + { + "epoch": 2.381584772023019, + "grad_norm": 0.6436763405799866, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2690 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.47175949811935425, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2700 + }, + { + "epoch": 2.3992917220008856, + "grad_norm": 0.480339378118515, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 2710 + }, + { + "epoch": 2.4081451969898184, + "grad_norm": 0.4723486006259918, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2720 + }, + { + "epoch": 2.4169986719787517, + "grad_norm": 0.4305492043495178, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 2730 + }, + { + "epoch": 2.425852146967685, + "grad_norm": 0.5007492303848267, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2740 + }, + { + "epoch": 2.434705621956618, + "grad_norm": 0.5374062061309814, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 2750 + }, + { + "epoch": 2.443559096945551, + "grad_norm": 0.45866212248802185, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2760 + }, + { + "epoch": 2.4524125719344845, + "grad_norm": 0.47914502024650574, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 2770 + }, + { + "epoch": 2.4612660469234173, + "grad_norm": 0.43804746866226196, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 2780 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.43656906485557556, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2790 + }, + { + "epoch": 2.478972996901284, + "grad_norm": 0.4820363521575928, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 2800 + }, + { + "epoch": 2.4878264718902168, + "grad_norm": 0.4916800558567047, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2810 + }, + { + "epoch": 2.49667994687915, + "grad_norm": 0.4521256983280182, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2820 + }, + { + "epoch": 2.5055334218680834, + "grad_norm": 0.5066806674003601, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2830 + }, + { + "epoch": 2.514386896857016, + "grad_norm": 0.4768151640892029, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2840 + }, + { + "epoch": 2.5232403718459495, + "grad_norm": 0.5144683718681335, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 2850 + }, + { + "epoch": 2.532093846834883, + "grad_norm": 0.4718942940235138, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2860 + }, + { + "epoch": 2.5409473218238157, + "grad_norm": 0.4924587309360504, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2870 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.4649953842163086, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2880 + }, + { + "epoch": 2.5586542718016823, + "grad_norm": 0.4836665987968445, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2890 + }, + { + "epoch": 2.567507746790615, + "grad_norm": 0.4162124991416931, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 2900 + }, + { + "epoch": 2.5763612217795484, + "grad_norm": 0.4894537925720215, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2910 + }, + { + "epoch": 2.5852146967684817, + "grad_norm": 0.4539397358894348, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 2.5940681717574146, + "grad_norm": 0.4718773066997528, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 2930 + }, + { + "epoch": 2.602921646746348, + "grad_norm": 0.49989837408065796, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2940 + }, + { + "epoch": 2.611775121735281, + "grad_norm": 0.4862406849861145, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2950 + }, + { + "epoch": 2.620628596724214, + "grad_norm": 0.4244804382324219, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2960 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.49304354190826416, + "learning_rate": 0.0002, + "loss": 1.7795, + "step": 2970 + }, + { + "epoch": 2.6383355467020806, + "grad_norm": 0.4818236529827118, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 2980 + }, + { + "epoch": 2.647189021691014, + "grad_norm": 0.5077425837516785, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 2990 + }, + { + "epoch": 2.6560424966799467, + "grad_norm": 0.4494157135486603, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 3000 + }, + { + "epoch": 2.66489597166888, + "grad_norm": 0.4790278971195221, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 3010 + }, + { + "epoch": 2.6737494466578133, + "grad_norm": 0.4702624976634979, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3020 + }, + { + "epoch": 2.682602921646746, + "grad_norm": 0.5082133412361145, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3030 + }, + { + "epoch": 2.6914563966356795, + "grad_norm": 0.4553256630897522, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 3040 + }, + { + "epoch": 2.700309871624613, + "grad_norm": 0.4492715001106262, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3050 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.4555944502353668, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 3060 + }, + { + "epoch": 2.718016821602479, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3070 + }, + { + "epoch": 2.7268702965914122, + "grad_norm": 0.4628562927246094, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3080 + }, + { + "epoch": 2.7357237715803455, + "grad_norm": 0.5169575810432434, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 3090 + }, + { + "epoch": 2.7445772465692784, + "grad_norm": 0.4630090892314911, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 3100 + }, + { + "epoch": 2.7534307215582117, + "grad_norm": 0.5437219738960266, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3110 + }, + { + "epoch": 2.762284196547145, + "grad_norm": 0.5102152228355408, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 3120 + }, + { + "epoch": 2.771137671536078, + "grad_norm": 0.48287826776504517, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3130 + }, + { + "epoch": 2.779991146525011, + "grad_norm": 0.4671737253665924, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 3140 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 0.5177035331726074, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3150 + }, + { + "epoch": 2.7976980965028773, + "grad_norm": 0.450989305973053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3160 + }, + { + "epoch": 2.8065515714918106, + "grad_norm": 0.45007848739624023, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3170 + }, + { + "epoch": 2.815405046480744, + "grad_norm": 0.4600294530391693, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3180 + }, + { + "epoch": 2.8242585214696767, + "grad_norm": 0.485628604888916, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 3190 + }, + { + "epoch": 2.83311199645861, + "grad_norm": 0.49811574816703796, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 3200 + }, + { + "epoch": 2.8419654714475433, + "grad_norm": 0.5012516975402832, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 3210 + }, + { + "epoch": 2.850818946436476, + "grad_norm": 0.4552757740020752, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3220 + }, + { + "epoch": 2.8596724214254094, + "grad_norm": 0.4539635479450226, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3230 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3240 + }, + { + "epoch": 2.8773793714032756, + "grad_norm": 0.4570811688899994, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3250 + }, + { + "epoch": 2.886232846392209, + "grad_norm": 0.48181653022766113, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3260 + }, + { + "epoch": 2.895086321381142, + "grad_norm": 0.4871032238006592, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 3270 + }, + { + "epoch": 2.903939796370075, + "grad_norm": 0.4643239676952362, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3280 + }, + { + "epoch": 2.9127932713590083, + "grad_norm": 0.5024484395980835, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3290 + }, + { + "epoch": 2.9216467463479416, + "grad_norm": 0.4425384998321533, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3300 + }, + { + "epoch": 2.9305002213368745, + "grad_norm": 0.459168016910553, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 3310 + }, + { + "epoch": 2.939353696325808, + "grad_norm": 0.4950717091560364, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 3320 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.4516230523586273, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3330 + }, + { + "epoch": 2.957060646303674, + "grad_norm": 0.49523285031318665, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3340 + }, + { + "epoch": 2.9659141212926072, + "grad_norm": 0.49282631278038025, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 3350 + }, + { + "epoch": 2.9747675962815405, + "grad_norm": 0.45825016498565674, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 3360 + }, + { + "epoch": 2.983621071270474, + "grad_norm": 0.4952891170978546, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 3370 + }, + { + "epoch": 2.9924745462594067, + "grad_norm": 0.42182639241218567, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3380 + }, + { + "epoch": 2.9995573262505535, + "eval_loss": 1.8308420181274414, + "eval_runtime": 82.786, + "eval_samples_per_second": 6.221, + "eval_steps_per_second": 0.785, + "step": 3388 + }, + { + "epoch": 3.00132802124834, + "grad_norm": 0.47721418738365173, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 3390 + }, + { + "epoch": 3.0101814962372733, + "grad_norm": 0.5284923911094666, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 3400 + }, + { + "epoch": 3.019034971226206, + "grad_norm": 0.5607061982154846, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 3410 + }, + { + "epoch": 3.0278884462151394, + "grad_norm": 0.5271363258361816, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 3420 + }, + { + "epoch": 3.0367419212040727, + "grad_norm": 0.48660898208618164, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 3430 + }, + { + "epoch": 3.0455953961930056, + "grad_norm": 0.5767933130264282, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 3440 + }, + { + "epoch": 3.054448871181939, + "grad_norm": 0.5591282248497009, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3450 + }, + { + "epoch": 3.063302346170872, + "grad_norm": 0.5870814323425293, + "learning_rate": 0.0002, + "loss": 1.5112, + "step": 3460 + }, + { + "epoch": 3.072155821159805, + "grad_norm": 0.4861546456813812, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 3470 + }, + { + "epoch": 3.0810092961487383, + "grad_norm": 0.5238925814628601, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 3480 + }, + { + "epoch": 3.0898627711376716, + "grad_norm": 0.5521751046180725, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 3490 + }, + { + "epoch": 3.098716246126605, + "grad_norm": 0.5816575884819031, + "learning_rate": 0.0002, + "loss": 1.4454, + "step": 3500 + }, + { + "epoch": 3.1075697211155378, + "grad_norm": 0.5281513333320618, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 3510 + }, + { + "epoch": 3.116423196104471, + "grad_norm": 0.5847303867340088, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 3520 + }, + { + "epoch": 3.1252766710934043, + "grad_norm": 0.5683517456054688, + "learning_rate": 0.0002, + "loss": 1.5513, + "step": 3530 + }, + { + "epoch": 3.134130146082337, + "grad_norm": 0.5177015662193298, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 3540 + }, + { + "epoch": 3.1429836210712705, + "grad_norm": 0.5922423601150513, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 3550 + }, + { + "epoch": 3.151837096060204, + "grad_norm": 0.7018587589263916, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 3560 + }, + { + "epoch": 3.1606905710491366, + "grad_norm": 0.6152004599571228, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 3570 + }, + { + "epoch": 3.16954404603807, + "grad_norm": 0.5350717902183533, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 3580 + }, + { + "epoch": 3.1783975210270032, + "grad_norm": 0.5971009731292725, + "learning_rate": 0.0002, + "loss": 1.4739, + "step": 3590 + }, + { + "epoch": 3.187250996015936, + "grad_norm": 0.7312001585960388, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3600 + }, + { + "epoch": 3.1961044710048694, + "grad_norm": 0.6372535228729248, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3610 + }, + { + "epoch": 3.2049579459938027, + "grad_norm": 0.6098020672798157, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 3620 + }, + { + "epoch": 3.2138114209827355, + "grad_norm": 0.5506435632705688, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3630 + }, + { + "epoch": 3.222664895971669, + "grad_norm": 0.6043022274971008, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3640 + }, + { + "epoch": 3.231518370960602, + "grad_norm": 0.5495519042015076, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3650 + }, + { + "epoch": 3.240371845949535, + "grad_norm": 0.5769572257995605, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3660 + }, + { + "epoch": 3.2492253209384683, + "grad_norm": 0.6833786964416504, + "learning_rate": 0.0002, + "loss": 1.4604, + "step": 3670 + }, + { + "epoch": 3.2580787959274016, + "grad_norm": 0.6962856650352478, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3680 + }, + { + "epoch": 3.2669322709163344, + "grad_norm": 0.6553098559379578, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 3690 + }, + { + "epoch": 3.2757857459052677, + "grad_norm": 0.5907557010650635, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 3700 + }, + { + "epoch": 3.284639220894201, + "grad_norm": 0.5712862014770508, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 3710 + }, + { + "epoch": 3.2934926958831343, + "grad_norm": 0.573820948600769, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 3720 + }, + { + "epoch": 3.302346170872067, + "grad_norm": 0.6650304198265076, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 3730 + }, + { + "epoch": 3.3111996458610005, + "grad_norm": 0.5182583928108215, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 3740 + }, + { + "epoch": 3.3200531208499338, + "grad_norm": 0.5078902840614319, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 3750 + }, + { + "epoch": 3.3289065958388666, + "grad_norm": 0.7062374353408813, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 3760 + }, + { + "epoch": 3.3377600708278, + "grad_norm": 0.5711262822151184, + "learning_rate": 0.0002, + "loss": 1.5017, + "step": 3770 + }, + { + "epoch": 3.346613545816733, + "grad_norm": 0.5624606013298035, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 3780 + }, + { + "epoch": 3.355467020805666, + "grad_norm": 0.6008231043815613, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 3790 + }, + { + "epoch": 3.3643204957945994, + "grad_norm": 0.6120018362998962, + "learning_rate": 0.0002, + "loss": 1.5038, + "step": 3800 + }, + { + "epoch": 3.3731739707835326, + "grad_norm": 0.5679979920387268, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3810 + }, + { + "epoch": 3.3820274457724655, + "grad_norm": 0.5613794922828674, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3820 + }, + { + "epoch": 3.390880920761399, + "grad_norm": 0.5328839421272278, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 3830 + }, + { + "epoch": 3.399734395750332, + "grad_norm": 0.5960017442703247, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 3840 + }, + { + "epoch": 3.4085878707392654, + "grad_norm": 0.5264106392860413, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 3850 + }, + { + "epoch": 3.4174413457281982, + "grad_norm": 0.6378359198570251, + "learning_rate": 0.0002, + "loss": 1.4766, + "step": 3860 + }, + { + "epoch": 3.4262948207171315, + "grad_norm": 0.5792967677116394, + "learning_rate": 0.0002, + "loss": 1.4898, + "step": 3870 + }, + { + "epoch": 3.435148295706065, + "grad_norm": 0.6836280822753906, + "learning_rate": 0.0002, + "loss": 1.4914, + "step": 3880 + }, + { + "epoch": 3.4440017706949977, + "grad_norm": 0.6073971390724182, + "learning_rate": 0.0002, + "loss": 1.5002, + "step": 3890 + }, + { + "epoch": 3.452855245683931, + "grad_norm": 0.5753195881843567, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 3900 + }, + { + "epoch": 3.4617087206728643, + "grad_norm": 0.6007646918296814, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3910 + }, + { + "epoch": 3.470562195661797, + "grad_norm": 0.6025636196136475, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3920 + }, + { + "epoch": 3.4794156706507304, + "grad_norm": 0.6819562315940857, + "learning_rate": 0.0002, + "loss": 1.4612, + "step": 3930 + }, + { + "epoch": 3.4882691456396637, + "grad_norm": 0.6448395848274231, + "learning_rate": 0.0002, + "loss": 1.518, + "step": 3940 + }, + { + "epoch": 3.4971226206285966, + "grad_norm": 0.5712178945541382, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 3950 + }, + { + "epoch": 3.50597609561753, + "grad_norm": 0.6300532817840576, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 3960 + }, + { + "epoch": 3.514829570606463, + "grad_norm": 0.6120840907096863, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 3970 + }, + { + "epoch": 3.523683045595396, + "grad_norm": 0.6887575387954712, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 3980 + }, + { + "epoch": 3.5325365205843293, + "grad_norm": 0.6970235109329224, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 3990 + }, + { + "epoch": 3.5413899955732626, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.0002, + "loss": 1.5198, + "step": 4000 + }, + { + "epoch": 3.5502434705621955, + "grad_norm": 1.0533310174942017, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 4010 + }, + { + "epoch": 3.5590969455511288, + "grad_norm": 0.5444280505180359, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4020 + }, + { + "epoch": 3.567950420540062, + "grad_norm": 0.6007506847381592, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4030 + }, + { + "epoch": 3.576803895528995, + "grad_norm": 0.6088743805885315, + "learning_rate": 0.0002, + "loss": 1.5059, + "step": 4040 + }, + { + "epoch": 3.585657370517928, + "grad_norm": 0.5934239029884338, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 4050 + }, + { + "epoch": 3.5945108455068615, + "grad_norm": 0.605251669883728, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 4060 + }, + { + "epoch": 3.6033643204957944, + "grad_norm": 0.5903469920158386, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 4070 + }, + { + "epoch": 3.6122177954847277, + "grad_norm": 0.6752413511276245, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4080 + }, + { + "epoch": 3.621071270473661, + "grad_norm": 0.5810418725013733, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4090 + }, + { + "epoch": 3.629924745462594, + "grad_norm": 0.5918573141098022, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 4100 + }, + { + "epoch": 3.638778220451527, + "grad_norm": 0.6635358333587646, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 4110 + }, + { + "epoch": 3.6476316954404604, + "grad_norm": 0.5785038471221924, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4120 + }, + { + "epoch": 3.6564851704293937, + "grad_norm": 0.5837879776954651, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4130 + }, + { + "epoch": 3.6653386454183265, + "grad_norm": 0.6449324488639832, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4140 + }, + { + "epoch": 3.67419212040726, + "grad_norm": 0.6191908717155457, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4150 + }, + { + "epoch": 3.683045595396193, + "grad_norm": 0.6937987208366394, + "learning_rate": 0.0002, + "loss": 1.4567, + "step": 4160 + }, + { + "epoch": 3.6918990703851264, + "grad_norm": 0.581128716468811, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 4170 + }, + { + "epoch": 3.7007525453740593, + "grad_norm": 0.6547803282737732, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 4180 + }, + { + "epoch": 3.7096060203629926, + "grad_norm": 0.5961150527000427, + "learning_rate": 0.0002, + "loss": 1.4653, + "step": 4190 + }, + { + "epoch": 3.718459495351926, + "grad_norm": 0.6197913885116577, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 4200 + }, + { + "epoch": 3.7273129703408587, + "grad_norm": 0.688565194606781, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 4210 + }, + { + "epoch": 3.736166445329792, + "grad_norm": 0.5832270979881287, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 4220 + }, + { + "epoch": 3.7450199203187253, + "grad_norm": 0.5643884539604187, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 4230 + }, + { + "epoch": 3.753873395307658, + "grad_norm": 0.6236484050750732, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4240 + }, + { + "epoch": 3.7627268702965915, + "grad_norm": 0.5367720127105713, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4250 + }, + { + "epoch": 3.7715803452855248, + "grad_norm": 0.5785109400749207, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4260 + }, + { + "epoch": 3.7804338202744576, + "grad_norm": 0.5698465704917908, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4270 + }, + { + "epoch": 3.789287295263391, + "grad_norm": 0.5748036503791809, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4280 + }, + { + "epoch": 3.798140770252324, + "grad_norm": 0.608147382736206, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 4290 + }, + { + "epoch": 3.806994245241257, + "grad_norm": 0.5820456147193909, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4300 + }, + { + "epoch": 3.8158477202301904, + "grad_norm": 0.6325612664222717, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4310 + }, + { + "epoch": 3.8247011952191237, + "grad_norm": 0.6465362310409546, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4320 + }, + { + "epoch": 3.8335546702080565, + "grad_norm": 0.5630854368209839, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4330 + }, + { + "epoch": 3.84240814519699, + "grad_norm": 0.6181462407112122, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4340 + }, + { + "epoch": 3.851261620185923, + "grad_norm": 0.6207571029663086, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 4350 + }, + { + "epoch": 3.860115095174856, + "grad_norm": 0.6092919111251831, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4360 + }, + { + "epoch": 3.8689685701637893, + "grad_norm": 0.6140493750572205, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 4370 + }, + { + "epoch": 3.8778220451527226, + "grad_norm": 0.611575722694397, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 4380 + }, + { + "epoch": 3.8866755201416554, + "grad_norm": 0.6288794279098511, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4390 + }, + { + "epoch": 3.8955289951305887, + "grad_norm": 0.6518979072570801, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4400 + }, + { + "epoch": 3.904382470119522, + "grad_norm": 0.6144753098487854, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4410 + }, + { + "epoch": 3.913235945108455, + "grad_norm": 0.7034937143325806, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 4420 + }, + { + "epoch": 3.922089420097388, + "grad_norm": 0.5713187456130981, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 4430 + }, + { + "epoch": 3.9309428950863214, + "grad_norm": 0.6187576651573181, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4440 + }, + { + "epoch": 3.9397963700752543, + "grad_norm": 0.6439383029937744, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 4450 + }, + { + "epoch": 3.9486498450641876, + "grad_norm": 0.6133334636688232, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 4460 + }, + { + "epoch": 3.957503320053121, + "grad_norm": 0.593463659286499, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4470 + }, + { + "epoch": 3.9663567950420537, + "grad_norm": 0.6261998414993286, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4480 + }, + { + "epoch": 3.975210270030987, + "grad_norm": 0.6153767704963684, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4490 + }, + { + "epoch": 3.9840637450199203, + "grad_norm": 0.6184002757072449, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 4500 + }, + { + "epoch": 3.9929172200088536, + "grad_norm": 0.5212734341621399, + "learning_rate": 0.0002, + "loss": 1.5134, + "step": 4510 + }, + { + "epoch": 4.0, + "eval_loss": 1.8745536804199219, + "eval_runtime": 83.0125, + "eval_samples_per_second": 6.204, + "eval_steps_per_second": 0.783, + "step": 4518 + }, + { + "epoch": 4.001770694997787, + "grad_norm": 0.5871603488922119, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4520 + }, + { + "epoch": 4.01062416998672, + "grad_norm": 0.6746091842651367, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 4530 + }, + { + "epoch": 4.019477644975653, + "grad_norm": 0.6159639358520508, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 4540 + }, + { + "epoch": 4.028331119964586, + "grad_norm": 0.7529398202896118, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 4550 + }, + { + "epoch": 4.037184594953519, + "grad_norm": 0.788398027420044, + "learning_rate": 0.0002, + "loss": 1.3202, + "step": 4560 + }, + { + "epoch": 4.046038069942452, + "grad_norm": 0.9679850935935974, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 4570 + }, + { + "epoch": 4.054891544931386, + "grad_norm": 0.6305310130119324, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4580 + }, + { + "epoch": 4.063745019920319, + "grad_norm": 0.8557451963424683, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 4590 + }, + { + "epoch": 4.0725984949092515, + "grad_norm": 0.741518497467041, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4600 + }, + { + "epoch": 4.081451969898185, + "grad_norm": 0.6573862433433533, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4610 + }, + { + "epoch": 4.090305444887118, + "grad_norm": 0.6926319599151611, + "learning_rate": 0.0002, + "loss": 1.3341, + "step": 4620 + }, + { + "epoch": 4.099158919876051, + "grad_norm": 0.9212626218795776, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4630 + }, + { + "epoch": 4.108012394864985, + "grad_norm": 0.7167867422103882, + "learning_rate": 0.0002, + "loss": 1.3402, + "step": 4640 + }, + { + "epoch": 4.116865869853918, + "grad_norm": 0.6691595911979675, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4650 + }, + { + "epoch": 4.12571934484285, + "grad_norm": 0.8708247542381287, + "learning_rate": 0.0002, + "loss": 1.247, + "step": 4660 + }, + { + "epoch": 4.134572819831784, + "grad_norm": 0.8612170219421387, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 4670 + }, + { + "epoch": 4.143426294820717, + "grad_norm": 0.7688325047492981, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4680 + }, + { + "epoch": 4.152279769809651, + "grad_norm": 0.7606917023658752, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4690 + }, + { + "epoch": 4.161133244798584, + "grad_norm": 0.8241282105445862, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 4700 + }, + { + "epoch": 4.1699867197875164, + "grad_norm": 0.7480464577674866, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 4710 + }, + { + "epoch": 4.17884019477645, + "grad_norm": 0.7092460989952087, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4720 + }, + { + "epoch": 4.187693669765383, + "grad_norm": 0.8782108426094055, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4730 + }, + { + "epoch": 4.196547144754316, + "grad_norm": 0.6875300407409668, + "learning_rate": 0.0002, + "loss": 1.3626, + "step": 4740 + }, + { + "epoch": 4.20540061974325, + "grad_norm": 0.7713887691497803, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 4750 + }, + { + "epoch": 4.2142540947321825, + "grad_norm": 0.8270819783210754, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 4760 + }, + { + "epoch": 4.223107569721115, + "grad_norm": 0.7109288573265076, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 4770 + }, + { + "epoch": 4.231961044710049, + "grad_norm": 0.7209359407424927, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 4780 + }, + { + "epoch": 4.240814519698982, + "grad_norm": 0.7142833471298218, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 4790 + }, + { + "epoch": 4.249667994687915, + "grad_norm": 0.8526809811592102, + "learning_rate": 0.0002, + "loss": 1.3654, + "step": 4800 + }, + { + "epoch": 4.2585214696768485, + "grad_norm": 0.7064695954322815, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 4810 + }, + { + "epoch": 4.267374944665781, + "grad_norm": 0.7646124362945557, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4820 + }, + { + "epoch": 4.276228419654714, + "grad_norm": 0.7377115488052368, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 4830 + }, + { + "epoch": 4.285081894643648, + "grad_norm": 0.7308453321456909, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 4840 + }, + { + "epoch": 4.293935369632581, + "grad_norm": 0.6687684059143066, + "learning_rate": 0.0002, + "loss": 1.3653, + "step": 4850 + }, + { + "epoch": 4.302788844621514, + "grad_norm": 0.7447634339332581, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 4860 + }, + { + "epoch": 4.311642319610447, + "grad_norm": 0.7661601305007935, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 4870 + }, + { + "epoch": 4.32049579459938, + "grad_norm": 0.7492215037345886, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4880 + }, + { + "epoch": 4.329349269588313, + "grad_norm": 0.9554458856582642, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 4890 + }, + { + "epoch": 4.338202744577247, + "grad_norm": 0.7409822940826416, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 4900 + }, + { + "epoch": 4.34705621956618, + "grad_norm": 0.9848645329475403, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 4910 + }, + { + "epoch": 4.355909694555113, + "grad_norm": 0.803995668888092, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4920 + }, + { + "epoch": 4.364763169544046, + "grad_norm": 0.7480606436729431, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 4930 + }, + { + "epoch": 4.373616644532979, + "grad_norm": 0.7018141150474548, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 4940 + }, + { + "epoch": 4.382470119521912, + "grad_norm": 0.7684932351112366, + "learning_rate": 0.0002, + "loss": 1.4034, + "step": 4950 + }, + { + "epoch": 4.391323594510846, + "grad_norm": 0.7849185466766357, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 4960 + }, + { + "epoch": 4.400177069499779, + "grad_norm": 0.7858862280845642, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 4970 + }, + { + "epoch": 4.4090305444887115, + "grad_norm": 0.8270778059959412, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4980 + }, + { + "epoch": 4.417884019477645, + "grad_norm": 0.8464101552963257, + "learning_rate": 0.0002, + "loss": 1.445, + "step": 4990 + }, + { + "epoch": 4.426737494466578, + "grad_norm": 0.85670405626297, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5000 + }, + { + "epoch": 4.435590969455511, + "grad_norm": 0.8656655550003052, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5010 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7605292201042175, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 5020 + }, + { + "epoch": 4.4532979194333775, + "grad_norm": 0.7682471871376038, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 5030 + }, + { + "epoch": 4.46215139442231, + "grad_norm": 0.7209102511405945, + "learning_rate": 0.0002, + "loss": 1.3432, + "step": 5040 + }, + { + "epoch": 4.471004869411244, + "grad_norm": 0.8259989023208618, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 5050 + }, + { + "epoch": 4.479858344400177, + "grad_norm": 0.7342197895050049, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 5060 + }, + { + "epoch": 4.48871181938911, + "grad_norm": 0.7869040369987488, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 5070 + }, + { + "epoch": 4.4975652943780435, + "grad_norm": 0.7906143665313721, + "learning_rate": 0.0002, + "loss": 1.3734, + "step": 5080 + }, + { + "epoch": 4.506418769366976, + "grad_norm": 0.7336861491203308, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 5090 + }, + { + "epoch": 4.515272244355909, + "grad_norm": 0.8264166712760925, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5100 + }, + { + "epoch": 4.524125719344843, + "grad_norm": 0.8144693970680237, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 5110 + }, + { + "epoch": 4.532979194333776, + "grad_norm": 0.8257269263267517, + "learning_rate": 0.0002, + "loss": 1.3044, + "step": 5120 + }, + { + "epoch": 4.541832669322709, + "grad_norm": 0.8838174343109131, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 5130 + }, + { + "epoch": 4.550686144311642, + "grad_norm": 0.7081145644187927, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 5140 + }, + { + "epoch": 4.559539619300575, + "grad_norm": 0.7137823700904846, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 5150 + }, + { + "epoch": 4.568393094289509, + "grad_norm": 0.7890386581420898, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 5160 + }, + { + "epoch": 4.577246569278442, + "grad_norm": 0.6418015360832214, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 5170 + }, + { + "epoch": 4.586100044267375, + "grad_norm": 0.768373966217041, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 5180 + }, + { + "epoch": 4.5949535192563085, + "grad_norm": 0.6934067606925964, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5190 + }, + { + "epoch": 4.603806994245241, + "grad_norm": 0.9430719017982483, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5200 + }, + { + "epoch": 4.612660469234174, + "grad_norm": 0.880264163017273, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 5210 + }, + { + "epoch": 4.621513944223108, + "grad_norm": 0.7584623098373413, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 5220 + }, + { + "epoch": 4.630367419212041, + "grad_norm": 0.7974506616592407, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5230 + }, + { + "epoch": 4.639220894200974, + "grad_norm": 0.8812133073806763, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 5240 + }, + { + "epoch": 4.648074369189907, + "grad_norm": 0.8968724012374878, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 5250 + }, + { + "epoch": 4.65692784417884, + "grad_norm": 0.7317764759063721, + "learning_rate": 0.0002, + "loss": 1.3327, + "step": 5260 + }, + { + "epoch": 4.665781319167773, + "grad_norm": 0.7415484189987183, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 5270 + }, + { + "epoch": 4.674634794156707, + "grad_norm": 0.7867009043693542, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 5280 + }, + { + "epoch": 4.68348826914564, + "grad_norm": 0.6895416378974915, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 5290 + }, + { + "epoch": 4.6923417441345725, + "grad_norm": 0.7324506640434265, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 5300 + }, + { + "epoch": 4.701195219123506, + "grad_norm": 0.7383193969726562, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 5310 + }, + { + "epoch": 4.710048694112439, + "grad_norm": 0.8254916071891785, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 5320 + }, + { + "epoch": 4.718902169101372, + "grad_norm": 0.8161033987998962, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 5330 + }, + { + "epoch": 4.727755644090306, + "grad_norm": 0.7664386034011841, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 5340 + }, + { + "epoch": 4.7366091190792385, + "grad_norm": 0.7465475797653198, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 5350 + }, + { + "epoch": 4.745462594068171, + "grad_norm": 0.7810078263282776, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 5360 + }, + { + "epoch": 4.754316069057105, + "grad_norm": 0.7428439855575562, + "learning_rate": 0.0002, + "loss": 1.4489, + "step": 5370 + }, + { + "epoch": 4.763169544046038, + "grad_norm": 0.9548320174217224, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 5380 + }, + { + "epoch": 4.772023019034972, + "grad_norm": 0.7959533333778381, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 5390 + }, + { + "epoch": 4.780876494023905, + "grad_norm": 0.747473418712616, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 5400 + }, + { + "epoch": 4.789729969012837, + "grad_norm": 0.7863122820854187, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5410 + }, + { + "epoch": 4.798583444001771, + "grad_norm": 0.7769626379013062, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 5420 + }, + { + "epoch": 4.807436918990704, + "grad_norm": 0.8551191091537476, + "learning_rate": 0.0002, + "loss": 1.4484, + "step": 5430 + }, + { + "epoch": 4.816290393979637, + "grad_norm": 0.8364850878715515, + "learning_rate": 0.0002, + "loss": 1.4314, + "step": 5440 + }, + { + "epoch": 4.825143868968571, + "grad_norm": 0.7458856701850891, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 5450 + }, + { + "epoch": 4.8339973439575035, + "grad_norm": 0.7558291554450989, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 5460 + }, + { + "epoch": 4.842850818946436, + "grad_norm": 0.8396534323692322, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 5470 + }, + { + "epoch": 4.85170429393537, + "grad_norm": 0.7790794968605042, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 5480 + }, + { + "epoch": 4.860557768924303, + "grad_norm": 0.8607641458511353, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5490 + }, + { + "epoch": 4.869411243913236, + "grad_norm": 0.828134298324585, + "learning_rate": 0.0002, + "loss": 1.4011, + "step": 5500 + }, + { + "epoch": 4.8782647189021695, + "grad_norm": 0.8783106803894043, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 5510 + }, + { + "epoch": 4.887118193891102, + "grad_norm": 0.7476183176040649, + "learning_rate": 0.0002, + "loss": 1.4565, + "step": 5520 + }, + { + "epoch": 4.895971668880035, + "grad_norm": 0.8023254871368408, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 5530 + }, + { + "epoch": 4.904825143868969, + "grad_norm": 0.8021706938743591, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 5540 + }, + { + "epoch": 4.913678618857902, + "grad_norm": 0.7873618602752686, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 5550 + }, + { + "epoch": 4.922532093846835, + "grad_norm": 0.7181428670883179, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5560 + }, + { + "epoch": 4.931385568835768, + "grad_norm": 0.7464273571968079, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 5570 + }, + { + "epoch": 4.940239043824701, + "grad_norm": 0.7433671355247498, + "learning_rate": 0.0002, + "loss": 1.3184, + "step": 5580 + }, + { + "epoch": 4.949092518813634, + "grad_norm": 0.7571114301681519, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5590 + }, + { + "epoch": 4.957945993802568, + "grad_norm": 0.7811630964279175, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5600 + }, + { + "epoch": 4.966799468791501, + "grad_norm": 0.7609148621559143, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 5610 + }, + { + "epoch": 4.9756529437804335, + "grad_norm": 0.7324382066726685, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 5620 + }, + { + "epoch": 4.984506418769367, + "grad_norm": 0.9249559640884399, + "learning_rate": 0.0002, + "loss": 1.4557, + "step": 5630 + }, + { + "epoch": 4.9933598937583, + "grad_norm": 0.7852522134780884, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 5640 + }, + { + "epoch": 4.999557326250553, + "eval_loss": 1.9384633302688599, + "eval_runtime": 82.6042, + "eval_samples_per_second": 6.235, + "eval_steps_per_second": 0.787, + "step": 5647 + }, + { + "epoch": 5.002213368747233, + "grad_norm": 0.8052749037742615, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5650 + }, + { + "epoch": 5.011066843736167, + "grad_norm": 1.380603551864624, + "learning_rate": 0.0002, + "loss": 1.1967, + "step": 5660 + }, + { + "epoch": 5.0199203187251, + "grad_norm": 0.9197829365730286, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 5670 + }, + { + "epoch": 5.028773793714032, + "grad_norm": 0.9338570833206177, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 5680 + }, + { + "epoch": 5.037627268702966, + "grad_norm": 1.0464060306549072, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5690 + }, + { + "epoch": 5.046480743691899, + "grad_norm": 0.9055638909339905, + "learning_rate": 0.0002, + "loss": 1.2211, + "step": 5700 + }, + { + "epoch": 5.055334218680832, + "grad_norm": 0.9494627714157104, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 5710 + }, + { + "epoch": 5.064187693669766, + "grad_norm": 0.9680962562561035, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 5720 + }, + { + "epoch": 5.0730411686586985, + "grad_norm": 1.0254695415496826, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 5730 + }, + { + "epoch": 5.081894643647631, + "grad_norm": 0.9306758642196655, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5740 + }, + { + "epoch": 5.090748118636565, + "grad_norm": 1.0620356798171997, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 5750 + }, + { + "epoch": 5.099601593625498, + "grad_norm": 1.0401700735092163, + "learning_rate": 0.0002, + "loss": 1.2628, + "step": 5760 + }, + { + "epoch": 5.108455068614431, + "grad_norm": 0.9916906952857971, + "learning_rate": 0.0002, + "loss": 1.1976, + "step": 5770 + }, + { + "epoch": 5.1173085436033645, + "grad_norm": 0.8387252688407898, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 5780 + }, + { + "epoch": 5.126162018592297, + "grad_norm": 0.9870850443840027, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 5790 + }, + { + "epoch": 5.13501549358123, + "grad_norm": 0.9204064011573792, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 5800 + }, + { + "epoch": 5.143868968570164, + "grad_norm": 0.9951931834220886, + "learning_rate": 0.0002, + "loss": 1.2266, + "step": 5810 + }, + { + "epoch": 5.152722443559097, + "grad_norm": 0.9745809435844421, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 5820 + }, + { + "epoch": 5.16157591854803, + "grad_norm": 0.9467785954475403, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 5830 + }, + { + "epoch": 5.170429393536963, + "grad_norm": 1.0451668500900269, + "learning_rate": 0.0002, + "loss": 1.2309, + "step": 5840 + }, + { + "epoch": 5.179282868525896, + "grad_norm": 0.9740142822265625, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 5850 + }, + { + "epoch": 5.18813634351483, + "grad_norm": 1.2158266305923462, + "learning_rate": 0.0002, + "loss": 1.2137, + "step": 5860 + }, + { + "epoch": 5.196989818503763, + "grad_norm": 1.0795036554336548, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 5870 + }, + { + "epoch": 5.205843293492696, + "grad_norm": 0.9578470587730408, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 5880 + }, + { + "epoch": 5.214696768481629, + "grad_norm": 0.8887509703636169, + "learning_rate": 0.0002, + "loss": 1.2183, + "step": 5890 + }, + { + "epoch": 5.223550243470562, + "grad_norm": 1.171006441116333, + "learning_rate": 0.0002, + "loss": 1.1991, + "step": 5900 + }, + { + "epoch": 5.232403718459495, + "grad_norm": 0.9016029834747314, + "learning_rate": 0.0002, + "loss": 1.1781, + "step": 5910 + }, + { + "epoch": 5.241257193448429, + "grad_norm": 1.173136830329895, + "learning_rate": 0.0002, + "loss": 1.2057, + "step": 5920 + }, + { + "epoch": 5.250110668437362, + "grad_norm": 0.8760318160057068, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 5930 + }, + { + "epoch": 5.258964143426295, + "grad_norm": 0.8998854160308838, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 5940 + }, + { + "epoch": 5.267817618415228, + "grad_norm": 1.017175316810608, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 5950 + }, + { + "epoch": 5.276671093404161, + "grad_norm": 0.8646609783172607, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5960 + }, + { + "epoch": 5.285524568393094, + "grad_norm": 1.0030627250671387, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 5970 + }, + { + "epoch": 5.294378043382028, + "grad_norm": 0.975911557674408, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5980 + }, + { + "epoch": 5.303231518370961, + "grad_norm": 0.9576130509376526, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 5990 + }, + { + "epoch": 5.3120849933598935, + "grad_norm": 0.9566167593002319, + "learning_rate": 0.0002, + "loss": 1.2681, + "step": 6000 + }, + { + "epoch": 5.320938468348827, + "grad_norm": 0.9200350642204285, + "learning_rate": 0.0002, + "loss": 1.2029, + "step": 6010 + }, + { + "epoch": 5.32979194333776, + "grad_norm": 1.0491118431091309, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 6020 + }, + { + "epoch": 5.338645418326693, + "grad_norm": 1.1199153661727905, + "learning_rate": 0.0002, + "loss": 1.2531, + "step": 6030 + }, + { + "epoch": 5.347498893315627, + "grad_norm": 1.015252947807312, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 6040 + }, + { + "epoch": 5.3563523683045595, + "grad_norm": 1.1076666116714478, + "learning_rate": 0.0002, + "loss": 1.2208, + "step": 6050 + }, + { + "epoch": 5.365205843293492, + "grad_norm": 0.9224653840065002, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 6060 + }, + { + "epoch": 5.374059318282426, + "grad_norm": 1.0079779624938965, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 6070 + }, + { + "epoch": 5.382912793271359, + "grad_norm": 0.9627894759178162, + "learning_rate": 0.0002, + "loss": 1.2612, + "step": 6080 + }, + { + "epoch": 5.391766268260292, + "grad_norm": 1.0503166913986206, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 6090 + }, + { + "epoch": 5.400619743249226, + "grad_norm": 0.912736713886261, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 6100 + }, + { + "epoch": 5.409473218238158, + "grad_norm": 1.2552032470703125, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 6110 + }, + { + "epoch": 5.418326693227091, + "grad_norm": 0.986230731010437, + "learning_rate": 0.0002, + "loss": 1.2738, + "step": 6120 + }, + { + "epoch": 5.427180168216025, + "grad_norm": 0.9869757294654846, + "learning_rate": 0.0002, + "loss": 1.3301, + "step": 6130 + }, + { + "epoch": 5.436033643204958, + "grad_norm": 1.012027621269226, + "learning_rate": 0.0002, + "loss": 1.241, + "step": 6140 + }, + { + "epoch": 5.444887118193891, + "grad_norm": 0.8855568170547485, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 6150 + }, + { + "epoch": 5.4537405931828244, + "grad_norm": 1.1522414684295654, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6160 + }, + { + "epoch": 5.462594068171757, + "grad_norm": 1.2448474168777466, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 6170 + }, + { + "epoch": 5.471447543160691, + "grad_norm": 1.0362223386764526, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 6180 + }, + { + "epoch": 5.480301018149624, + "grad_norm": 0.9363031983375549, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 6190 + }, + { + "epoch": 5.489154493138557, + "grad_norm": 0.8852020502090454, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 6200 + }, + { + "epoch": 5.4980079681274905, + "grad_norm": 0.8577062487602234, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 6210 + }, + { + "epoch": 5.506861443116423, + "grad_norm": 0.9351891875267029, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 6220 + }, + { + "epoch": 5.515714918105356, + "grad_norm": 1.0031992197036743, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 6230 + }, + { + "epoch": 5.52456839309429, + "grad_norm": 0.9935104250907898, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 6240 + }, + { + "epoch": 5.533421868083223, + "grad_norm": 1.1086243391036987, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 6250 + }, + { + "epoch": 5.542275343072156, + "grad_norm": 0.990772545337677, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 6260 + }, + { + "epoch": 5.551128818061089, + "grad_norm": 0.9317597150802612, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 6270 + }, + { + "epoch": 5.559982293050022, + "grad_norm": 0.9657552242279053, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 6280 + }, + { + "epoch": 5.568835768038955, + "grad_norm": 1.0271565914154053, + "learning_rate": 0.0002, + "loss": 1.2435, + "step": 6290 + }, + { + "epoch": 5.577689243027889, + "grad_norm": 0.916253924369812, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6300 + }, + { + "epoch": 5.586542718016822, + "grad_norm": 1.0083940029144287, + "learning_rate": 0.0002, + "loss": 1.2648, + "step": 6310 + }, + { + "epoch": 5.5953961930057545, + "grad_norm": 0.9740358591079712, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 6320 + }, + { + "epoch": 5.604249667994688, + "grad_norm": 0.9645405411720276, + "learning_rate": 0.0002, + "loss": 1.2507, + "step": 6330 + }, + { + "epoch": 5.613103142983621, + "grad_norm": 0.9677100777626038, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 6340 + }, + { + "epoch": 5.621956617972554, + "grad_norm": 0.9706602692604065, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 6350 + }, + { + "epoch": 5.630810092961488, + "grad_norm": 1.1492316722869873, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 6360 + }, + { + "epoch": 5.639663567950421, + "grad_norm": 0.8857277035713196, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 6370 + }, + { + "epoch": 5.648517042939353, + "grad_norm": 1.0363037586212158, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 6380 + }, + { + "epoch": 5.657370517928287, + "grad_norm": 0.9621800780296326, + "learning_rate": 0.0002, + "loss": 1.1838, + "step": 6390 + }, + { + "epoch": 5.66622399291722, + "grad_norm": 0.9937820434570312, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 6400 + }, + { + "epoch": 5.675077467906153, + "grad_norm": 0.9491283297538757, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6410 + }, + { + "epoch": 5.683930942895087, + "grad_norm": 0.9429448246955872, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6420 + }, + { + "epoch": 5.6927844178840195, + "grad_norm": 0.9808844327926636, + "learning_rate": 0.0002, + "loss": 1.1663, + "step": 6430 + }, + { + "epoch": 5.701637892872952, + "grad_norm": 0.8191056847572327, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 6440 + }, + { + "epoch": 5.710491367861886, + "grad_norm": 1.1118974685668945, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6450 + }, + { + "epoch": 5.719344842850819, + "grad_norm": 0.9030969142913818, + "learning_rate": 0.0002, + "loss": 1.2192, + "step": 6460 + }, + { + "epoch": 5.728198317839752, + "grad_norm": 1.0509997606277466, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6470 + }, + { + "epoch": 5.7370517928286855, + "grad_norm": 1.0369981527328491, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 6480 + }, + { + "epoch": 5.745905267817618, + "grad_norm": 0.8626071214675903, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 6490 + }, + { + "epoch": 5.754758742806551, + "grad_norm": 1.0448849201202393, + "learning_rate": 0.0002, + "loss": 1.2446, + "step": 6500 + }, + { + "epoch": 5.763612217795485, + "grad_norm": 0.9333119988441467, + "learning_rate": 0.0002, + "loss": 1.2698, + "step": 6510 + }, + { + "epoch": 5.772465692784418, + "grad_norm": 0.8533532023429871, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 6520 + }, + { + "epoch": 5.781319167773351, + "grad_norm": 0.9774261713027954, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6530 + }, + { + "epoch": 5.790172642762284, + "grad_norm": 0.9841071963310242, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 6540 + }, + { + "epoch": 5.799026117751217, + "grad_norm": 0.9891805052757263, + "learning_rate": 0.0002, + "loss": 1.2767, + "step": 6550 + }, + { + "epoch": 5.80787959274015, + "grad_norm": 0.9633952379226685, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 6560 + }, + { + "epoch": 5.816733067729084, + "grad_norm": 1.327634334564209, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 6570 + }, + { + "epoch": 5.825586542718017, + "grad_norm": 0.9805197715759277, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 6580 + }, + { + "epoch": 5.8344400177069495, + "grad_norm": 1.020957589149475, + "learning_rate": 0.0002, + "loss": 1.1933, + "step": 6590 + }, + { + "epoch": 5.843293492695883, + "grad_norm": 0.9694032669067383, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6600 + }, + { + "epoch": 5.852146967684816, + "grad_norm": 0.8980914354324341, + "learning_rate": 0.0002, + "loss": 1.2671, + "step": 6610 + }, + { + "epoch": 5.861000442673749, + "grad_norm": 0.8312330842018127, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 6620 + }, + { + "epoch": 5.869853917662683, + "grad_norm": 0.9773725271224976, + "learning_rate": 0.0002, + "loss": 1.3301, + "step": 6630 + }, + { + "epoch": 5.878707392651616, + "grad_norm": 0.9684233665466309, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 6640 + }, + { + "epoch": 5.887560867640548, + "grad_norm": 0.8436519503593445, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 6650 + }, + { + "epoch": 5.896414342629482, + "grad_norm": 0.9129888415336609, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 6660 + }, + { + "epoch": 5.905267817618415, + "grad_norm": 0.8871369957923889, + "learning_rate": 0.0002, + "loss": 1.3272, + "step": 6670 + }, + { + "epoch": 5.914121292607349, + "grad_norm": 0.9544420838356018, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6680 + }, + { + "epoch": 5.922974767596282, + "grad_norm": 0.9607448577880859, + "learning_rate": 0.0002, + "loss": 1.2954, + "step": 6690 + }, + { + "epoch": 5.9318282425852145, + "grad_norm": 0.9675708413124084, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 6700 + }, + { + "epoch": 5.940681717574148, + "grad_norm": 0.9373534321784973, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 6710 + }, + { + "epoch": 5.949535192563081, + "grad_norm": 0.9750351905822754, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 6720 + }, + { + "epoch": 5.958388667552014, + "grad_norm": 0.9122727513313293, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 6730 + }, + { + "epoch": 5.967242142540948, + "grad_norm": 0.9300726652145386, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 6740 + }, + { + "epoch": 5.9760956175298805, + "grad_norm": 0.972944438457489, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6750 + }, + { + "epoch": 5.984949092518813, + "grad_norm": 1.2385832071304321, + "learning_rate": 0.0002, + "loss": 1.3252, + "step": 6760 + }, + { + "epoch": 5.993802567507747, + "grad_norm": 0.9080338478088379, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6770 + }, + { + "epoch": 6.0, + "eval_loss": 2.062002658843994, + "eval_runtime": 83.2814, + "eval_samples_per_second": 6.184, + "eval_steps_per_second": 0.78, + "step": 6777 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.136243856824074e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-6777/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a2181bbf57942a1396ce6dacbc8f8708239ed4c9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9184c5b57c480a25724799ac6bf43f5c73d94fe3a578bb664ddd2ea6e5abb4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f0440e114d479a7b0a43f2bea3d727acc9c6217 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705a42a41c4919319cf04956afe6ffb8053c25d1e916ed87351cae2333d51116 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b107ba45f93ea7d0a9f14219734f77030ae1986d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3966b079a2816ec926289cb004a0decf735f7f15967256be8e84036e12ea4bbf +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2291dc9e25d9c61f376d175aff6783a943511ce --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1382cd9a54b20bb9286803220dd7cb2d2d9e0fab45f1d10c60c2c5620036f4e4 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2346b4d71293c10c9a06063dfbe782bfb8009c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/trainer_state.json @@ -0,0 +1,5619 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 6.999557326250553, + "eval_steps": 10, + "global_step": 7906, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + }, + { + "epoch": 2.0008853474988935, + "grad_norm": 0.34340235590934753, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 2.0097388224878263, + "grad_norm": 0.3710762858390808, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2270 + }, + { + "epoch": 2.0185922974767596, + "grad_norm": 0.35640114545822144, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2280 + }, + { + "epoch": 2.027445772465693, + "grad_norm": 0.45970189571380615, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 2290 + }, + { + "epoch": 2.0362992474546258, + "grad_norm": 0.4256797134876251, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2300 + }, + { + "epoch": 2.045152722443559, + "grad_norm": 0.42421531677246094, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2310 + }, + { + "epoch": 2.0540061974324924, + "grad_norm": 0.4032478928565979, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2320 + }, + { + "epoch": 2.062859672421425, + "grad_norm": 0.4073623716831207, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2330 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.4845200777053833, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2340 + }, + { + "epoch": 2.080566622399292, + "grad_norm": 0.40578293800354004, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2350 + }, + { + "epoch": 2.089420097388225, + "grad_norm": 0.4037284255027771, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2360 + }, + { + "epoch": 2.098273572377158, + "grad_norm": 0.4717613160610199, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2370 + }, + { + "epoch": 2.1071270473660912, + "grad_norm": 0.42076411843299866, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2380 + }, + { + "epoch": 2.1159805223550245, + "grad_norm": 0.47799113392829895, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 2390 + }, + { + "epoch": 2.1248339973439574, + "grad_norm": 0.4253084063529968, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2400 + }, + { + "epoch": 2.1336874723328907, + "grad_norm": 0.5023085474967957, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 2410 + }, + { + "epoch": 2.142540947321824, + "grad_norm": 0.49162712693214417, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2420 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.39035019278526306, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2430 + }, + { + "epoch": 2.16024789729969, + "grad_norm": 0.43223854899406433, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 2440 + }, + { + "epoch": 2.1691013722886234, + "grad_norm": 0.4596616327762604, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 2.1779548472775563, + "grad_norm": 0.4469447731971741, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2460 + }, + { + "epoch": 2.1868083222664896, + "grad_norm": 0.5100595355033875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2470 + }, + { + "epoch": 2.195661797255423, + "grad_norm": 0.4169430732727051, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2480 + }, + { + "epoch": 2.2045152722443557, + "grad_norm": 0.4699254035949707, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 2490 + }, + { + "epoch": 2.213368747233289, + "grad_norm": 0.43524250388145447, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4496648907661438, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2510 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.43408212065696716, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2520 + }, + { + "epoch": 2.2399291722000885, + "grad_norm": 0.4596034288406372, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2530 + }, + { + "epoch": 2.2487826471890218, + "grad_norm": 0.5217021107673645, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2540 + }, + { + "epoch": 2.2576361221779546, + "grad_norm": 0.44745638966560364, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2550 + }, + { + "epoch": 2.266489597166888, + "grad_norm": 0.4484798014163971, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 2560 + }, + { + "epoch": 2.275343072155821, + "grad_norm": 0.4428067207336426, + "learning_rate": 0.0002, + "loss": 1.5321, + "step": 2570 + }, + { + "epoch": 2.2841965471447545, + "grad_norm": 0.5095171332359314, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2580 + }, + { + "epoch": 2.2930500221336874, + "grad_norm": 0.44833096861839294, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2590 + }, + { + "epoch": 2.3019034971226207, + "grad_norm": 0.507905900478363, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 2600 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.40808171033859253, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2610 + }, + { + "epoch": 2.319610447100487, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2620 + }, + { + "epoch": 2.32846392208942, + "grad_norm": 0.44864922761917114, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2630 + }, + { + "epoch": 2.3373173970783534, + "grad_norm": 0.4174162745475769, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2640 + }, + { + "epoch": 2.3461708720672863, + "grad_norm": 0.42314743995666504, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 2650 + }, + { + "epoch": 2.3550243470562195, + "grad_norm": 0.49224185943603516, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2660 + }, + { + "epoch": 2.363877822045153, + "grad_norm": 0.45190292596817017, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2670 + }, + { + "epoch": 2.3727312970340857, + "grad_norm": 0.41817107796669006, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 2680 + }, + { + "epoch": 2.381584772023019, + "grad_norm": 0.6436763405799866, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2690 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.47175949811935425, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2700 + }, + { + "epoch": 2.3992917220008856, + "grad_norm": 0.480339378118515, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 2710 + }, + { + "epoch": 2.4081451969898184, + "grad_norm": 0.4723486006259918, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2720 + }, + { + "epoch": 2.4169986719787517, + "grad_norm": 0.4305492043495178, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 2730 + }, + { + "epoch": 2.425852146967685, + "grad_norm": 0.5007492303848267, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2740 + }, + { + "epoch": 2.434705621956618, + "grad_norm": 0.5374062061309814, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 2750 + }, + { + "epoch": 2.443559096945551, + "grad_norm": 0.45866212248802185, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2760 + }, + { + "epoch": 2.4524125719344845, + "grad_norm": 0.47914502024650574, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 2770 + }, + { + "epoch": 2.4612660469234173, + "grad_norm": 0.43804746866226196, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 2780 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.43656906485557556, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2790 + }, + { + "epoch": 2.478972996901284, + "grad_norm": 0.4820363521575928, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 2800 + }, + { + "epoch": 2.4878264718902168, + "grad_norm": 0.4916800558567047, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2810 + }, + { + "epoch": 2.49667994687915, + "grad_norm": 0.4521256983280182, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2820 + }, + { + "epoch": 2.5055334218680834, + "grad_norm": 0.5066806674003601, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2830 + }, + { + "epoch": 2.514386896857016, + "grad_norm": 0.4768151640892029, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2840 + }, + { + "epoch": 2.5232403718459495, + "grad_norm": 0.5144683718681335, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 2850 + }, + { + "epoch": 2.532093846834883, + "grad_norm": 0.4718942940235138, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2860 + }, + { + "epoch": 2.5409473218238157, + "grad_norm": 0.4924587309360504, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2870 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.4649953842163086, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2880 + }, + { + "epoch": 2.5586542718016823, + "grad_norm": 0.4836665987968445, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2890 + }, + { + "epoch": 2.567507746790615, + "grad_norm": 0.4162124991416931, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 2900 + }, + { + "epoch": 2.5763612217795484, + "grad_norm": 0.4894537925720215, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2910 + }, + { + "epoch": 2.5852146967684817, + "grad_norm": 0.4539397358894348, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 2.5940681717574146, + "grad_norm": 0.4718773066997528, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 2930 + }, + { + "epoch": 2.602921646746348, + "grad_norm": 0.49989837408065796, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2940 + }, + { + "epoch": 2.611775121735281, + "grad_norm": 0.4862406849861145, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2950 + }, + { + "epoch": 2.620628596724214, + "grad_norm": 0.4244804382324219, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2960 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.49304354190826416, + "learning_rate": 0.0002, + "loss": 1.7795, + "step": 2970 + }, + { + "epoch": 2.6383355467020806, + "grad_norm": 0.4818236529827118, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 2980 + }, + { + "epoch": 2.647189021691014, + "grad_norm": 0.5077425837516785, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 2990 + }, + { + "epoch": 2.6560424966799467, + "grad_norm": 0.4494157135486603, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 3000 + }, + { + "epoch": 2.66489597166888, + "grad_norm": 0.4790278971195221, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 3010 + }, + { + "epoch": 2.6737494466578133, + "grad_norm": 0.4702624976634979, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3020 + }, + { + "epoch": 2.682602921646746, + "grad_norm": 0.5082133412361145, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3030 + }, + { + "epoch": 2.6914563966356795, + "grad_norm": 0.4553256630897522, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 3040 + }, + { + "epoch": 2.700309871624613, + "grad_norm": 0.4492715001106262, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3050 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.4555944502353668, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 3060 + }, + { + "epoch": 2.718016821602479, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3070 + }, + { + "epoch": 2.7268702965914122, + "grad_norm": 0.4628562927246094, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3080 + }, + { + "epoch": 2.7357237715803455, + "grad_norm": 0.5169575810432434, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 3090 + }, + { + "epoch": 2.7445772465692784, + "grad_norm": 0.4630090892314911, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 3100 + }, + { + "epoch": 2.7534307215582117, + "grad_norm": 0.5437219738960266, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3110 + }, + { + "epoch": 2.762284196547145, + "grad_norm": 0.5102152228355408, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 3120 + }, + { + "epoch": 2.771137671536078, + "grad_norm": 0.48287826776504517, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3130 + }, + { + "epoch": 2.779991146525011, + "grad_norm": 0.4671737253665924, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 3140 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 0.5177035331726074, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3150 + }, + { + "epoch": 2.7976980965028773, + "grad_norm": 0.450989305973053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3160 + }, + { + "epoch": 2.8065515714918106, + "grad_norm": 0.45007848739624023, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3170 + }, + { + "epoch": 2.815405046480744, + "grad_norm": 0.4600294530391693, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3180 + }, + { + "epoch": 2.8242585214696767, + "grad_norm": 0.485628604888916, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 3190 + }, + { + "epoch": 2.83311199645861, + "grad_norm": 0.49811574816703796, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 3200 + }, + { + "epoch": 2.8419654714475433, + "grad_norm": 0.5012516975402832, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 3210 + }, + { + "epoch": 2.850818946436476, + "grad_norm": 0.4552757740020752, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3220 + }, + { + "epoch": 2.8596724214254094, + "grad_norm": 0.4539635479450226, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3230 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3240 + }, + { + "epoch": 2.8773793714032756, + "grad_norm": 0.4570811688899994, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3250 + }, + { + "epoch": 2.886232846392209, + "grad_norm": 0.48181653022766113, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3260 + }, + { + "epoch": 2.895086321381142, + "grad_norm": 0.4871032238006592, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 3270 + }, + { + "epoch": 2.903939796370075, + "grad_norm": 0.4643239676952362, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3280 + }, + { + "epoch": 2.9127932713590083, + "grad_norm": 0.5024484395980835, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3290 + }, + { + "epoch": 2.9216467463479416, + "grad_norm": 0.4425384998321533, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3300 + }, + { + "epoch": 2.9305002213368745, + "grad_norm": 0.459168016910553, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 3310 + }, + { + "epoch": 2.939353696325808, + "grad_norm": 0.4950717091560364, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 3320 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.4516230523586273, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3330 + }, + { + "epoch": 2.957060646303674, + "grad_norm": 0.49523285031318665, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3340 + }, + { + "epoch": 2.9659141212926072, + "grad_norm": 0.49282631278038025, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 3350 + }, + { + "epoch": 2.9747675962815405, + "grad_norm": 0.45825016498565674, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 3360 + }, + { + "epoch": 2.983621071270474, + "grad_norm": 0.4952891170978546, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 3370 + }, + { + "epoch": 2.9924745462594067, + "grad_norm": 0.42182639241218567, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3380 + }, + { + "epoch": 2.9995573262505535, + "eval_loss": 1.8308420181274414, + "eval_runtime": 82.786, + "eval_samples_per_second": 6.221, + "eval_steps_per_second": 0.785, + "step": 3388 + }, + { + "epoch": 3.00132802124834, + "grad_norm": 0.47721418738365173, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 3390 + }, + { + "epoch": 3.0101814962372733, + "grad_norm": 0.5284923911094666, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 3400 + }, + { + "epoch": 3.019034971226206, + "grad_norm": 0.5607061982154846, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 3410 + }, + { + "epoch": 3.0278884462151394, + "grad_norm": 0.5271363258361816, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 3420 + }, + { + "epoch": 3.0367419212040727, + "grad_norm": 0.48660898208618164, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 3430 + }, + { + "epoch": 3.0455953961930056, + "grad_norm": 0.5767933130264282, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 3440 + }, + { + "epoch": 3.054448871181939, + "grad_norm": 0.5591282248497009, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3450 + }, + { + "epoch": 3.063302346170872, + "grad_norm": 0.5870814323425293, + "learning_rate": 0.0002, + "loss": 1.5112, + "step": 3460 + }, + { + "epoch": 3.072155821159805, + "grad_norm": 0.4861546456813812, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 3470 + }, + { + "epoch": 3.0810092961487383, + "grad_norm": 0.5238925814628601, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 3480 + }, + { + "epoch": 3.0898627711376716, + "grad_norm": 0.5521751046180725, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 3490 + }, + { + "epoch": 3.098716246126605, + "grad_norm": 0.5816575884819031, + "learning_rate": 0.0002, + "loss": 1.4454, + "step": 3500 + }, + { + "epoch": 3.1075697211155378, + "grad_norm": 0.5281513333320618, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 3510 + }, + { + "epoch": 3.116423196104471, + "grad_norm": 0.5847303867340088, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 3520 + }, + { + "epoch": 3.1252766710934043, + "grad_norm": 0.5683517456054688, + "learning_rate": 0.0002, + "loss": 1.5513, + "step": 3530 + }, + { + "epoch": 3.134130146082337, + "grad_norm": 0.5177015662193298, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 3540 + }, + { + "epoch": 3.1429836210712705, + "grad_norm": 0.5922423601150513, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 3550 + }, + { + "epoch": 3.151837096060204, + "grad_norm": 0.7018587589263916, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 3560 + }, + { + "epoch": 3.1606905710491366, + "grad_norm": 0.6152004599571228, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 3570 + }, + { + "epoch": 3.16954404603807, + "grad_norm": 0.5350717902183533, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 3580 + }, + { + "epoch": 3.1783975210270032, + "grad_norm": 0.5971009731292725, + "learning_rate": 0.0002, + "loss": 1.4739, + "step": 3590 + }, + { + "epoch": 3.187250996015936, + "grad_norm": 0.7312001585960388, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3600 + }, + { + "epoch": 3.1961044710048694, + "grad_norm": 0.6372535228729248, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3610 + }, + { + "epoch": 3.2049579459938027, + "grad_norm": 0.6098020672798157, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 3620 + }, + { + "epoch": 3.2138114209827355, + "grad_norm": 0.5506435632705688, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3630 + }, + { + "epoch": 3.222664895971669, + "grad_norm": 0.6043022274971008, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3640 + }, + { + "epoch": 3.231518370960602, + "grad_norm": 0.5495519042015076, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3650 + }, + { + "epoch": 3.240371845949535, + "grad_norm": 0.5769572257995605, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3660 + }, + { + "epoch": 3.2492253209384683, + "grad_norm": 0.6833786964416504, + "learning_rate": 0.0002, + "loss": 1.4604, + "step": 3670 + }, + { + "epoch": 3.2580787959274016, + "grad_norm": 0.6962856650352478, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3680 + }, + { + "epoch": 3.2669322709163344, + "grad_norm": 0.6553098559379578, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 3690 + }, + { + "epoch": 3.2757857459052677, + "grad_norm": 0.5907557010650635, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 3700 + }, + { + "epoch": 3.284639220894201, + "grad_norm": 0.5712862014770508, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 3710 + }, + { + "epoch": 3.2934926958831343, + "grad_norm": 0.573820948600769, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 3720 + }, + { + "epoch": 3.302346170872067, + "grad_norm": 0.6650304198265076, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 3730 + }, + { + "epoch": 3.3111996458610005, + "grad_norm": 0.5182583928108215, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 3740 + }, + { + "epoch": 3.3200531208499338, + "grad_norm": 0.5078902840614319, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 3750 + }, + { + "epoch": 3.3289065958388666, + "grad_norm": 0.7062374353408813, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 3760 + }, + { + "epoch": 3.3377600708278, + "grad_norm": 0.5711262822151184, + "learning_rate": 0.0002, + "loss": 1.5017, + "step": 3770 + }, + { + "epoch": 3.346613545816733, + "grad_norm": 0.5624606013298035, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 3780 + }, + { + "epoch": 3.355467020805666, + "grad_norm": 0.6008231043815613, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 3790 + }, + { + "epoch": 3.3643204957945994, + "grad_norm": 0.6120018362998962, + "learning_rate": 0.0002, + "loss": 1.5038, + "step": 3800 + }, + { + "epoch": 3.3731739707835326, + "grad_norm": 0.5679979920387268, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3810 + }, + { + "epoch": 3.3820274457724655, + "grad_norm": 0.5613794922828674, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3820 + }, + { + "epoch": 3.390880920761399, + "grad_norm": 0.5328839421272278, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 3830 + }, + { + "epoch": 3.399734395750332, + "grad_norm": 0.5960017442703247, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 3840 + }, + { + "epoch": 3.4085878707392654, + "grad_norm": 0.5264106392860413, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 3850 + }, + { + "epoch": 3.4174413457281982, + "grad_norm": 0.6378359198570251, + "learning_rate": 0.0002, + "loss": 1.4766, + "step": 3860 + }, + { + "epoch": 3.4262948207171315, + "grad_norm": 0.5792967677116394, + "learning_rate": 0.0002, + "loss": 1.4898, + "step": 3870 + }, + { + "epoch": 3.435148295706065, + "grad_norm": 0.6836280822753906, + "learning_rate": 0.0002, + "loss": 1.4914, + "step": 3880 + }, + { + "epoch": 3.4440017706949977, + "grad_norm": 0.6073971390724182, + "learning_rate": 0.0002, + "loss": 1.5002, + "step": 3890 + }, + { + "epoch": 3.452855245683931, + "grad_norm": 0.5753195881843567, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 3900 + }, + { + "epoch": 3.4617087206728643, + "grad_norm": 0.6007646918296814, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3910 + }, + { + "epoch": 3.470562195661797, + "grad_norm": 0.6025636196136475, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3920 + }, + { + "epoch": 3.4794156706507304, + "grad_norm": 0.6819562315940857, + "learning_rate": 0.0002, + "loss": 1.4612, + "step": 3930 + }, + { + "epoch": 3.4882691456396637, + "grad_norm": 0.6448395848274231, + "learning_rate": 0.0002, + "loss": 1.518, + "step": 3940 + }, + { + "epoch": 3.4971226206285966, + "grad_norm": 0.5712178945541382, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 3950 + }, + { + "epoch": 3.50597609561753, + "grad_norm": 0.6300532817840576, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 3960 + }, + { + "epoch": 3.514829570606463, + "grad_norm": 0.6120840907096863, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 3970 + }, + { + "epoch": 3.523683045595396, + "grad_norm": 0.6887575387954712, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 3980 + }, + { + "epoch": 3.5325365205843293, + "grad_norm": 0.6970235109329224, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 3990 + }, + { + "epoch": 3.5413899955732626, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.0002, + "loss": 1.5198, + "step": 4000 + }, + { + "epoch": 3.5502434705621955, + "grad_norm": 1.0533310174942017, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 4010 + }, + { + "epoch": 3.5590969455511288, + "grad_norm": 0.5444280505180359, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4020 + }, + { + "epoch": 3.567950420540062, + "grad_norm": 0.6007506847381592, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4030 + }, + { + "epoch": 3.576803895528995, + "grad_norm": 0.6088743805885315, + "learning_rate": 0.0002, + "loss": 1.5059, + "step": 4040 + }, + { + "epoch": 3.585657370517928, + "grad_norm": 0.5934239029884338, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 4050 + }, + { + "epoch": 3.5945108455068615, + "grad_norm": 0.605251669883728, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 4060 + }, + { + "epoch": 3.6033643204957944, + "grad_norm": 0.5903469920158386, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 4070 + }, + { + "epoch": 3.6122177954847277, + "grad_norm": 0.6752413511276245, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4080 + }, + { + "epoch": 3.621071270473661, + "grad_norm": 0.5810418725013733, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4090 + }, + { + "epoch": 3.629924745462594, + "grad_norm": 0.5918573141098022, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 4100 + }, + { + "epoch": 3.638778220451527, + "grad_norm": 0.6635358333587646, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 4110 + }, + { + "epoch": 3.6476316954404604, + "grad_norm": 0.5785038471221924, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4120 + }, + { + "epoch": 3.6564851704293937, + "grad_norm": 0.5837879776954651, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4130 + }, + { + "epoch": 3.6653386454183265, + "grad_norm": 0.6449324488639832, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4140 + }, + { + "epoch": 3.67419212040726, + "grad_norm": 0.6191908717155457, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4150 + }, + { + "epoch": 3.683045595396193, + "grad_norm": 0.6937987208366394, + "learning_rate": 0.0002, + "loss": 1.4567, + "step": 4160 + }, + { + "epoch": 3.6918990703851264, + "grad_norm": 0.581128716468811, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 4170 + }, + { + "epoch": 3.7007525453740593, + "grad_norm": 0.6547803282737732, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 4180 + }, + { + "epoch": 3.7096060203629926, + "grad_norm": 0.5961150527000427, + "learning_rate": 0.0002, + "loss": 1.4653, + "step": 4190 + }, + { + "epoch": 3.718459495351926, + "grad_norm": 0.6197913885116577, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 4200 + }, + { + "epoch": 3.7273129703408587, + "grad_norm": 0.688565194606781, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 4210 + }, + { + "epoch": 3.736166445329792, + "grad_norm": 0.5832270979881287, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 4220 + }, + { + "epoch": 3.7450199203187253, + "grad_norm": 0.5643884539604187, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 4230 + }, + { + "epoch": 3.753873395307658, + "grad_norm": 0.6236484050750732, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4240 + }, + { + "epoch": 3.7627268702965915, + "grad_norm": 0.5367720127105713, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4250 + }, + { + "epoch": 3.7715803452855248, + "grad_norm": 0.5785109400749207, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4260 + }, + { + "epoch": 3.7804338202744576, + "grad_norm": 0.5698465704917908, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4270 + }, + { + "epoch": 3.789287295263391, + "grad_norm": 0.5748036503791809, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4280 + }, + { + "epoch": 3.798140770252324, + "grad_norm": 0.608147382736206, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 4290 + }, + { + "epoch": 3.806994245241257, + "grad_norm": 0.5820456147193909, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4300 + }, + { + "epoch": 3.8158477202301904, + "grad_norm": 0.6325612664222717, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4310 + }, + { + "epoch": 3.8247011952191237, + "grad_norm": 0.6465362310409546, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4320 + }, + { + "epoch": 3.8335546702080565, + "grad_norm": 0.5630854368209839, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4330 + }, + { + "epoch": 3.84240814519699, + "grad_norm": 0.6181462407112122, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4340 + }, + { + "epoch": 3.851261620185923, + "grad_norm": 0.6207571029663086, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 4350 + }, + { + "epoch": 3.860115095174856, + "grad_norm": 0.6092919111251831, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4360 + }, + { + "epoch": 3.8689685701637893, + "grad_norm": 0.6140493750572205, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 4370 + }, + { + "epoch": 3.8778220451527226, + "grad_norm": 0.611575722694397, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 4380 + }, + { + "epoch": 3.8866755201416554, + "grad_norm": 0.6288794279098511, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4390 + }, + { + "epoch": 3.8955289951305887, + "grad_norm": 0.6518979072570801, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4400 + }, + { + "epoch": 3.904382470119522, + "grad_norm": 0.6144753098487854, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4410 + }, + { + "epoch": 3.913235945108455, + "grad_norm": 0.7034937143325806, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 4420 + }, + { + "epoch": 3.922089420097388, + "grad_norm": 0.5713187456130981, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 4430 + }, + { + "epoch": 3.9309428950863214, + "grad_norm": 0.6187576651573181, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4440 + }, + { + "epoch": 3.9397963700752543, + "grad_norm": 0.6439383029937744, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 4450 + }, + { + "epoch": 3.9486498450641876, + "grad_norm": 0.6133334636688232, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 4460 + }, + { + "epoch": 3.957503320053121, + "grad_norm": 0.593463659286499, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4470 + }, + { + "epoch": 3.9663567950420537, + "grad_norm": 0.6261998414993286, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4480 + }, + { + "epoch": 3.975210270030987, + "grad_norm": 0.6153767704963684, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4490 + }, + { + "epoch": 3.9840637450199203, + "grad_norm": 0.6184002757072449, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 4500 + }, + { + "epoch": 3.9929172200088536, + "grad_norm": 0.5212734341621399, + "learning_rate": 0.0002, + "loss": 1.5134, + "step": 4510 + }, + { + "epoch": 4.0, + "eval_loss": 1.8745536804199219, + "eval_runtime": 83.0125, + "eval_samples_per_second": 6.204, + "eval_steps_per_second": 0.783, + "step": 4518 + }, + { + "epoch": 4.001770694997787, + "grad_norm": 0.5871603488922119, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4520 + }, + { + "epoch": 4.01062416998672, + "grad_norm": 0.6746091842651367, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 4530 + }, + { + "epoch": 4.019477644975653, + "grad_norm": 0.6159639358520508, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 4540 + }, + { + "epoch": 4.028331119964586, + "grad_norm": 0.7529398202896118, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 4550 + }, + { + "epoch": 4.037184594953519, + "grad_norm": 0.788398027420044, + "learning_rate": 0.0002, + "loss": 1.3202, + "step": 4560 + }, + { + "epoch": 4.046038069942452, + "grad_norm": 0.9679850935935974, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 4570 + }, + { + "epoch": 4.054891544931386, + "grad_norm": 0.6305310130119324, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4580 + }, + { + "epoch": 4.063745019920319, + "grad_norm": 0.8557451963424683, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 4590 + }, + { + "epoch": 4.0725984949092515, + "grad_norm": 0.741518497467041, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4600 + }, + { + "epoch": 4.081451969898185, + "grad_norm": 0.6573862433433533, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4610 + }, + { + "epoch": 4.090305444887118, + "grad_norm": 0.6926319599151611, + "learning_rate": 0.0002, + "loss": 1.3341, + "step": 4620 + }, + { + "epoch": 4.099158919876051, + "grad_norm": 0.9212626218795776, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4630 + }, + { + "epoch": 4.108012394864985, + "grad_norm": 0.7167867422103882, + "learning_rate": 0.0002, + "loss": 1.3402, + "step": 4640 + }, + { + "epoch": 4.116865869853918, + "grad_norm": 0.6691595911979675, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4650 + }, + { + "epoch": 4.12571934484285, + "grad_norm": 0.8708247542381287, + "learning_rate": 0.0002, + "loss": 1.247, + "step": 4660 + }, + { + "epoch": 4.134572819831784, + "grad_norm": 0.8612170219421387, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 4670 + }, + { + "epoch": 4.143426294820717, + "grad_norm": 0.7688325047492981, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4680 + }, + { + "epoch": 4.152279769809651, + "grad_norm": 0.7606917023658752, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4690 + }, + { + "epoch": 4.161133244798584, + "grad_norm": 0.8241282105445862, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 4700 + }, + { + "epoch": 4.1699867197875164, + "grad_norm": 0.7480464577674866, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 4710 + }, + { + "epoch": 4.17884019477645, + "grad_norm": 0.7092460989952087, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4720 + }, + { + "epoch": 4.187693669765383, + "grad_norm": 0.8782108426094055, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4730 + }, + { + "epoch": 4.196547144754316, + "grad_norm": 0.6875300407409668, + "learning_rate": 0.0002, + "loss": 1.3626, + "step": 4740 + }, + { + "epoch": 4.20540061974325, + "grad_norm": 0.7713887691497803, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 4750 + }, + { + "epoch": 4.2142540947321825, + "grad_norm": 0.8270819783210754, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 4760 + }, + { + "epoch": 4.223107569721115, + "grad_norm": 0.7109288573265076, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 4770 + }, + { + "epoch": 4.231961044710049, + "grad_norm": 0.7209359407424927, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 4780 + }, + { + "epoch": 4.240814519698982, + "grad_norm": 0.7142833471298218, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 4790 + }, + { + "epoch": 4.249667994687915, + "grad_norm": 0.8526809811592102, + "learning_rate": 0.0002, + "loss": 1.3654, + "step": 4800 + }, + { + "epoch": 4.2585214696768485, + "grad_norm": 0.7064695954322815, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 4810 + }, + { + "epoch": 4.267374944665781, + "grad_norm": 0.7646124362945557, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4820 + }, + { + "epoch": 4.276228419654714, + "grad_norm": 0.7377115488052368, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 4830 + }, + { + "epoch": 4.285081894643648, + "grad_norm": 0.7308453321456909, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 4840 + }, + { + "epoch": 4.293935369632581, + "grad_norm": 0.6687684059143066, + "learning_rate": 0.0002, + "loss": 1.3653, + "step": 4850 + }, + { + "epoch": 4.302788844621514, + "grad_norm": 0.7447634339332581, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 4860 + }, + { + "epoch": 4.311642319610447, + "grad_norm": 0.7661601305007935, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 4870 + }, + { + "epoch": 4.32049579459938, + "grad_norm": 0.7492215037345886, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4880 + }, + { + "epoch": 4.329349269588313, + "grad_norm": 0.9554458856582642, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 4890 + }, + { + "epoch": 4.338202744577247, + "grad_norm": 0.7409822940826416, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 4900 + }, + { + "epoch": 4.34705621956618, + "grad_norm": 0.9848645329475403, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 4910 + }, + { + "epoch": 4.355909694555113, + "grad_norm": 0.803995668888092, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4920 + }, + { + "epoch": 4.364763169544046, + "grad_norm": 0.7480606436729431, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 4930 + }, + { + "epoch": 4.373616644532979, + "grad_norm": 0.7018141150474548, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 4940 + }, + { + "epoch": 4.382470119521912, + "grad_norm": 0.7684932351112366, + "learning_rate": 0.0002, + "loss": 1.4034, + "step": 4950 + }, + { + "epoch": 4.391323594510846, + "grad_norm": 0.7849185466766357, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 4960 + }, + { + "epoch": 4.400177069499779, + "grad_norm": 0.7858862280845642, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 4970 + }, + { + "epoch": 4.4090305444887115, + "grad_norm": 0.8270778059959412, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4980 + }, + { + "epoch": 4.417884019477645, + "grad_norm": 0.8464101552963257, + "learning_rate": 0.0002, + "loss": 1.445, + "step": 4990 + }, + { + "epoch": 4.426737494466578, + "grad_norm": 0.85670405626297, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5000 + }, + { + "epoch": 4.435590969455511, + "grad_norm": 0.8656655550003052, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5010 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7605292201042175, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 5020 + }, + { + "epoch": 4.4532979194333775, + "grad_norm": 0.7682471871376038, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 5030 + }, + { + "epoch": 4.46215139442231, + "grad_norm": 0.7209102511405945, + "learning_rate": 0.0002, + "loss": 1.3432, + "step": 5040 + }, + { + "epoch": 4.471004869411244, + "grad_norm": 0.8259989023208618, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 5050 + }, + { + "epoch": 4.479858344400177, + "grad_norm": 0.7342197895050049, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 5060 + }, + { + "epoch": 4.48871181938911, + "grad_norm": 0.7869040369987488, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 5070 + }, + { + "epoch": 4.4975652943780435, + "grad_norm": 0.7906143665313721, + "learning_rate": 0.0002, + "loss": 1.3734, + "step": 5080 + }, + { + "epoch": 4.506418769366976, + "grad_norm": 0.7336861491203308, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 5090 + }, + { + "epoch": 4.515272244355909, + "grad_norm": 0.8264166712760925, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5100 + }, + { + "epoch": 4.524125719344843, + "grad_norm": 0.8144693970680237, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 5110 + }, + { + "epoch": 4.532979194333776, + "grad_norm": 0.8257269263267517, + "learning_rate": 0.0002, + "loss": 1.3044, + "step": 5120 + }, + { + "epoch": 4.541832669322709, + "grad_norm": 0.8838174343109131, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 5130 + }, + { + "epoch": 4.550686144311642, + "grad_norm": 0.7081145644187927, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 5140 + }, + { + "epoch": 4.559539619300575, + "grad_norm": 0.7137823700904846, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 5150 + }, + { + "epoch": 4.568393094289509, + "grad_norm": 0.7890386581420898, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 5160 + }, + { + "epoch": 4.577246569278442, + "grad_norm": 0.6418015360832214, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 5170 + }, + { + "epoch": 4.586100044267375, + "grad_norm": 0.768373966217041, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 5180 + }, + { + "epoch": 4.5949535192563085, + "grad_norm": 0.6934067606925964, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5190 + }, + { + "epoch": 4.603806994245241, + "grad_norm": 0.9430719017982483, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5200 + }, + { + "epoch": 4.612660469234174, + "grad_norm": 0.880264163017273, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 5210 + }, + { + "epoch": 4.621513944223108, + "grad_norm": 0.7584623098373413, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 5220 + }, + { + "epoch": 4.630367419212041, + "grad_norm": 0.7974506616592407, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5230 + }, + { + "epoch": 4.639220894200974, + "grad_norm": 0.8812133073806763, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 5240 + }, + { + "epoch": 4.648074369189907, + "grad_norm": 0.8968724012374878, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 5250 + }, + { + "epoch": 4.65692784417884, + "grad_norm": 0.7317764759063721, + "learning_rate": 0.0002, + "loss": 1.3327, + "step": 5260 + }, + { + "epoch": 4.665781319167773, + "grad_norm": 0.7415484189987183, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 5270 + }, + { + "epoch": 4.674634794156707, + "grad_norm": 0.7867009043693542, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 5280 + }, + { + "epoch": 4.68348826914564, + "grad_norm": 0.6895416378974915, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 5290 + }, + { + "epoch": 4.6923417441345725, + "grad_norm": 0.7324506640434265, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 5300 + }, + { + "epoch": 4.701195219123506, + "grad_norm": 0.7383193969726562, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 5310 + }, + { + "epoch": 4.710048694112439, + "grad_norm": 0.8254916071891785, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 5320 + }, + { + "epoch": 4.718902169101372, + "grad_norm": 0.8161033987998962, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 5330 + }, + { + "epoch": 4.727755644090306, + "grad_norm": 0.7664386034011841, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 5340 + }, + { + "epoch": 4.7366091190792385, + "grad_norm": 0.7465475797653198, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 5350 + }, + { + "epoch": 4.745462594068171, + "grad_norm": 0.7810078263282776, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 5360 + }, + { + "epoch": 4.754316069057105, + "grad_norm": 0.7428439855575562, + "learning_rate": 0.0002, + "loss": 1.4489, + "step": 5370 + }, + { + "epoch": 4.763169544046038, + "grad_norm": 0.9548320174217224, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 5380 + }, + { + "epoch": 4.772023019034972, + "grad_norm": 0.7959533333778381, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 5390 + }, + { + "epoch": 4.780876494023905, + "grad_norm": 0.747473418712616, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 5400 + }, + { + "epoch": 4.789729969012837, + "grad_norm": 0.7863122820854187, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5410 + }, + { + "epoch": 4.798583444001771, + "grad_norm": 0.7769626379013062, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 5420 + }, + { + "epoch": 4.807436918990704, + "grad_norm": 0.8551191091537476, + "learning_rate": 0.0002, + "loss": 1.4484, + "step": 5430 + }, + { + "epoch": 4.816290393979637, + "grad_norm": 0.8364850878715515, + "learning_rate": 0.0002, + "loss": 1.4314, + "step": 5440 + }, + { + "epoch": 4.825143868968571, + "grad_norm": 0.7458856701850891, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 5450 + }, + { + "epoch": 4.8339973439575035, + "grad_norm": 0.7558291554450989, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 5460 + }, + { + "epoch": 4.842850818946436, + "grad_norm": 0.8396534323692322, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 5470 + }, + { + "epoch": 4.85170429393537, + "grad_norm": 0.7790794968605042, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 5480 + }, + { + "epoch": 4.860557768924303, + "grad_norm": 0.8607641458511353, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5490 + }, + { + "epoch": 4.869411243913236, + "grad_norm": 0.828134298324585, + "learning_rate": 0.0002, + "loss": 1.4011, + "step": 5500 + }, + { + "epoch": 4.8782647189021695, + "grad_norm": 0.8783106803894043, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 5510 + }, + { + "epoch": 4.887118193891102, + "grad_norm": 0.7476183176040649, + "learning_rate": 0.0002, + "loss": 1.4565, + "step": 5520 + }, + { + "epoch": 4.895971668880035, + "grad_norm": 0.8023254871368408, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 5530 + }, + { + "epoch": 4.904825143868969, + "grad_norm": 0.8021706938743591, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 5540 + }, + { + "epoch": 4.913678618857902, + "grad_norm": 0.7873618602752686, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 5550 + }, + { + "epoch": 4.922532093846835, + "grad_norm": 0.7181428670883179, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5560 + }, + { + "epoch": 4.931385568835768, + "grad_norm": 0.7464273571968079, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 5570 + }, + { + "epoch": 4.940239043824701, + "grad_norm": 0.7433671355247498, + "learning_rate": 0.0002, + "loss": 1.3184, + "step": 5580 + }, + { + "epoch": 4.949092518813634, + "grad_norm": 0.7571114301681519, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5590 + }, + { + "epoch": 4.957945993802568, + "grad_norm": 0.7811630964279175, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5600 + }, + { + "epoch": 4.966799468791501, + "grad_norm": 0.7609148621559143, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 5610 + }, + { + "epoch": 4.9756529437804335, + "grad_norm": 0.7324382066726685, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 5620 + }, + { + "epoch": 4.984506418769367, + "grad_norm": 0.9249559640884399, + "learning_rate": 0.0002, + "loss": 1.4557, + "step": 5630 + }, + { + "epoch": 4.9933598937583, + "grad_norm": 0.7852522134780884, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 5640 + }, + { + "epoch": 4.999557326250553, + "eval_loss": 1.9384633302688599, + "eval_runtime": 82.6042, + "eval_samples_per_second": 6.235, + "eval_steps_per_second": 0.787, + "step": 5647 + }, + { + "epoch": 5.002213368747233, + "grad_norm": 0.8052749037742615, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5650 + }, + { + "epoch": 5.011066843736167, + "grad_norm": 1.380603551864624, + "learning_rate": 0.0002, + "loss": 1.1967, + "step": 5660 + }, + { + "epoch": 5.0199203187251, + "grad_norm": 0.9197829365730286, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 5670 + }, + { + "epoch": 5.028773793714032, + "grad_norm": 0.9338570833206177, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 5680 + }, + { + "epoch": 5.037627268702966, + "grad_norm": 1.0464060306549072, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5690 + }, + { + "epoch": 5.046480743691899, + "grad_norm": 0.9055638909339905, + "learning_rate": 0.0002, + "loss": 1.2211, + "step": 5700 + }, + { + "epoch": 5.055334218680832, + "grad_norm": 0.9494627714157104, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 5710 + }, + { + "epoch": 5.064187693669766, + "grad_norm": 0.9680962562561035, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 5720 + }, + { + "epoch": 5.0730411686586985, + "grad_norm": 1.0254695415496826, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 5730 + }, + { + "epoch": 5.081894643647631, + "grad_norm": 0.9306758642196655, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5740 + }, + { + "epoch": 5.090748118636565, + "grad_norm": 1.0620356798171997, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 5750 + }, + { + "epoch": 5.099601593625498, + "grad_norm": 1.0401700735092163, + "learning_rate": 0.0002, + "loss": 1.2628, + "step": 5760 + }, + { + "epoch": 5.108455068614431, + "grad_norm": 0.9916906952857971, + "learning_rate": 0.0002, + "loss": 1.1976, + "step": 5770 + }, + { + "epoch": 5.1173085436033645, + "grad_norm": 0.8387252688407898, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 5780 + }, + { + "epoch": 5.126162018592297, + "grad_norm": 0.9870850443840027, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 5790 + }, + { + "epoch": 5.13501549358123, + "grad_norm": 0.9204064011573792, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 5800 + }, + { + "epoch": 5.143868968570164, + "grad_norm": 0.9951931834220886, + "learning_rate": 0.0002, + "loss": 1.2266, + "step": 5810 + }, + { + "epoch": 5.152722443559097, + "grad_norm": 0.9745809435844421, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 5820 + }, + { + "epoch": 5.16157591854803, + "grad_norm": 0.9467785954475403, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 5830 + }, + { + "epoch": 5.170429393536963, + "grad_norm": 1.0451668500900269, + "learning_rate": 0.0002, + "loss": 1.2309, + "step": 5840 + }, + { + "epoch": 5.179282868525896, + "grad_norm": 0.9740142822265625, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 5850 + }, + { + "epoch": 5.18813634351483, + "grad_norm": 1.2158266305923462, + "learning_rate": 0.0002, + "loss": 1.2137, + "step": 5860 + }, + { + "epoch": 5.196989818503763, + "grad_norm": 1.0795036554336548, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 5870 + }, + { + "epoch": 5.205843293492696, + "grad_norm": 0.9578470587730408, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 5880 + }, + { + "epoch": 5.214696768481629, + "grad_norm": 0.8887509703636169, + "learning_rate": 0.0002, + "loss": 1.2183, + "step": 5890 + }, + { + "epoch": 5.223550243470562, + "grad_norm": 1.171006441116333, + "learning_rate": 0.0002, + "loss": 1.1991, + "step": 5900 + }, + { + "epoch": 5.232403718459495, + "grad_norm": 0.9016029834747314, + "learning_rate": 0.0002, + "loss": 1.1781, + "step": 5910 + }, + { + "epoch": 5.241257193448429, + "grad_norm": 1.173136830329895, + "learning_rate": 0.0002, + "loss": 1.2057, + "step": 5920 + }, + { + "epoch": 5.250110668437362, + "grad_norm": 0.8760318160057068, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 5930 + }, + { + "epoch": 5.258964143426295, + "grad_norm": 0.8998854160308838, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 5940 + }, + { + "epoch": 5.267817618415228, + "grad_norm": 1.017175316810608, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 5950 + }, + { + "epoch": 5.276671093404161, + "grad_norm": 0.8646609783172607, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5960 + }, + { + "epoch": 5.285524568393094, + "grad_norm": 1.0030627250671387, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 5970 + }, + { + "epoch": 5.294378043382028, + "grad_norm": 0.975911557674408, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5980 + }, + { + "epoch": 5.303231518370961, + "grad_norm": 0.9576130509376526, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 5990 + }, + { + "epoch": 5.3120849933598935, + "grad_norm": 0.9566167593002319, + "learning_rate": 0.0002, + "loss": 1.2681, + "step": 6000 + }, + { + "epoch": 5.320938468348827, + "grad_norm": 0.9200350642204285, + "learning_rate": 0.0002, + "loss": 1.2029, + "step": 6010 + }, + { + "epoch": 5.32979194333776, + "grad_norm": 1.0491118431091309, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 6020 + }, + { + "epoch": 5.338645418326693, + "grad_norm": 1.1199153661727905, + "learning_rate": 0.0002, + "loss": 1.2531, + "step": 6030 + }, + { + "epoch": 5.347498893315627, + "grad_norm": 1.015252947807312, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 6040 + }, + { + "epoch": 5.3563523683045595, + "grad_norm": 1.1076666116714478, + "learning_rate": 0.0002, + "loss": 1.2208, + "step": 6050 + }, + { + "epoch": 5.365205843293492, + "grad_norm": 0.9224653840065002, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 6060 + }, + { + "epoch": 5.374059318282426, + "grad_norm": 1.0079779624938965, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 6070 + }, + { + "epoch": 5.382912793271359, + "grad_norm": 0.9627894759178162, + "learning_rate": 0.0002, + "loss": 1.2612, + "step": 6080 + }, + { + "epoch": 5.391766268260292, + "grad_norm": 1.0503166913986206, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 6090 + }, + { + "epoch": 5.400619743249226, + "grad_norm": 0.912736713886261, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 6100 + }, + { + "epoch": 5.409473218238158, + "grad_norm": 1.2552032470703125, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 6110 + }, + { + "epoch": 5.418326693227091, + "grad_norm": 0.986230731010437, + "learning_rate": 0.0002, + "loss": 1.2738, + "step": 6120 + }, + { + "epoch": 5.427180168216025, + "grad_norm": 0.9869757294654846, + "learning_rate": 0.0002, + "loss": 1.3301, + "step": 6130 + }, + { + "epoch": 5.436033643204958, + "grad_norm": 1.012027621269226, + "learning_rate": 0.0002, + "loss": 1.241, + "step": 6140 + }, + { + "epoch": 5.444887118193891, + "grad_norm": 0.8855568170547485, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 6150 + }, + { + "epoch": 5.4537405931828244, + "grad_norm": 1.1522414684295654, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6160 + }, + { + "epoch": 5.462594068171757, + "grad_norm": 1.2448474168777466, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 6170 + }, + { + "epoch": 5.471447543160691, + "grad_norm": 1.0362223386764526, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 6180 + }, + { + "epoch": 5.480301018149624, + "grad_norm": 0.9363031983375549, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 6190 + }, + { + "epoch": 5.489154493138557, + "grad_norm": 0.8852020502090454, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 6200 + }, + { + "epoch": 5.4980079681274905, + "grad_norm": 0.8577062487602234, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 6210 + }, + { + "epoch": 5.506861443116423, + "grad_norm": 0.9351891875267029, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 6220 + }, + { + "epoch": 5.515714918105356, + "grad_norm": 1.0031992197036743, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 6230 + }, + { + "epoch": 5.52456839309429, + "grad_norm": 0.9935104250907898, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 6240 + }, + { + "epoch": 5.533421868083223, + "grad_norm": 1.1086243391036987, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 6250 + }, + { + "epoch": 5.542275343072156, + "grad_norm": 0.990772545337677, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 6260 + }, + { + "epoch": 5.551128818061089, + "grad_norm": 0.9317597150802612, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 6270 + }, + { + "epoch": 5.559982293050022, + "grad_norm": 0.9657552242279053, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 6280 + }, + { + "epoch": 5.568835768038955, + "grad_norm": 1.0271565914154053, + "learning_rate": 0.0002, + "loss": 1.2435, + "step": 6290 + }, + { + "epoch": 5.577689243027889, + "grad_norm": 0.916253924369812, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6300 + }, + { + "epoch": 5.586542718016822, + "grad_norm": 1.0083940029144287, + "learning_rate": 0.0002, + "loss": 1.2648, + "step": 6310 + }, + { + "epoch": 5.5953961930057545, + "grad_norm": 0.9740358591079712, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 6320 + }, + { + "epoch": 5.604249667994688, + "grad_norm": 0.9645405411720276, + "learning_rate": 0.0002, + "loss": 1.2507, + "step": 6330 + }, + { + "epoch": 5.613103142983621, + "grad_norm": 0.9677100777626038, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 6340 + }, + { + "epoch": 5.621956617972554, + "grad_norm": 0.9706602692604065, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 6350 + }, + { + "epoch": 5.630810092961488, + "grad_norm": 1.1492316722869873, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 6360 + }, + { + "epoch": 5.639663567950421, + "grad_norm": 0.8857277035713196, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 6370 + }, + { + "epoch": 5.648517042939353, + "grad_norm": 1.0363037586212158, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 6380 + }, + { + "epoch": 5.657370517928287, + "grad_norm": 0.9621800780296326, + "learning_rate": 0.0002, + "loss": 1.1838, + "step": 6390 + }, + { + "epoch": 5.66622399291722, + "grad_norm": 0.9937820434570312, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 6400 + }, + { + "epoch": 5.675077467906153, + "grad_norm": 0.9491283297538757, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6410 + }, + { + "epoch": 5.683930942895087, + "grad_norm": 0.9429448246955872, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6420 + }, + { + "epoch": 5.6927844178840195, + "grad_norm": 0.9808844327926636, + "learning_rate": 0.0002, + "loss": 1.1663, + "step": 6430 + }, + { + "epoch": 5.701637892872952, + "grad_norm": 0.8191056847572327, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 6440 + }, + { + "epoch": 5.710491367861886, + "grad_norm": 1.1118974685668945, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6450 + }, + { + "epoch": 5.719344842850819, + "grad_norm": 0.9030969142913818, + "learning_rate": 0.0002, + "loss": 1.2192, + "step": 6460 + }, + { + "epoch": 5.728198317839752, + "grad_norm": 1.0509997606277466, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6470 + }, + { + "epoch": 5.7370517928286855, + "grad_norm": 1.0369981527328491, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 6480 + }, + { + "epoch": 5.745905267817618, + "grad_norm": 0.8626071214675903, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 6490 + }, + { + "epoch": 5.754758742806551, + "grad_norm": 1.0448849201202393, + "learning_rate": 0.0002, + "loss": 1.2446, + "step": 6500 + }, + { + "epoch": 5.763612217795485, + "grad_norm": 0.9333119988441467, + "learning_rate": 0.0002, + "loss": 1.2698, + "step": 6510 + }, + { + "epoch": 5.772465692784418, + "grad_norm": 0.8533532023429871, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 6520 + }, + { + "epoch": 5.781319167773351, + "grad_norm": 0.9774261713027954, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6530 + }, + { + "epoch": 5.790172642762284, + "grad_norm": 0.9841071963310242, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 6540 + }, + { + "epoch": 5.799026117751217, + "grad_norm": 0.9891805052757263, + "learning_rate": 0.0002, + "loss": 1.2767, + "step": 6550 + }, + { + "epoch": 5.80787959274015, + "grad_norm": 0.9633952379226685, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 6560 + }, + { + "epoch": 5.816733067729084, + "grad_norm": 1.327634334564209, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 6570 + }, + { + "epoch": 5.825586542718017, + "grad_norm": 0.9805197715759277, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 6580 + }, + { + "epoch": 5.8344400177069495, + "grad_norm": 1.020957589149475, + "learning_rate": 0.0002, + "loss": 1.1933, + "step": 6590 + }, + { + "epoch": 5.843293492695883, + "grad_norm": 0.9694032669067383, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6600 + }, + { + "epoch": 5.852146967684816, + "grad_norm": 0.8980914354324341, + "learning_rate": 0.0002, + "loss": 1.2671, + "step": 6610 + }, + { + "epoch": 5.861000442673749, + "grad_norm": 0.8312330842018127, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 6620 + }, + { + "epoch": 5.869853917662683, + "grad_norm": 0.9773725271224976, + "learning_rate": 0.0002, + "loss": 1.3301, + "step": 6630 + }, + { + "epoch": 5.878707392651616, + "grad_norm": 0.9684233665466309, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 6640 + }, + { + "epoch": 5.887560867640548, + "grad_norm": 0.8436519503593445, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 6650 + }, + { + "epoch": 5.896414342629482, + "grad_norm": 0.9129888415336609, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 6660 + }, + { + "epoch": 5.905267817618415, + "grad_norm": 0.8871369957923889, + "learning_rate": 0.0002, + "loss": 1.3272, + "step": 6670 + }, + { + "epoch": 5.914121292607349, + "grad_norm": 0.9544420838356018, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6680 + }, + { + "epoch": 5.922974767596282, + "grad_norm": 0.9607448577880859, + "learning_rate": 0.0002, + "loss": 1.2954, + "step": 6690 + }, + { + "epoch": 5.9318282425852145, + "grad_norm": 0.9675708413124084, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 6700 + }, + { + "epoch": 5.940681717574148, + "grad_norm": 0.9373534321784973, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 6710 + }, + { + "epoch": 5.949535192563081, + "grad_norm": 0.9750351905822754, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 6720 + }, + { + "epoch": 5.958388667552014, + "grad_norm": 0.9122727513313293, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 6730 + }, + { + "epoch": 5.967242142540948, + "grad_norm": 0.9300726652145386, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 6740 + }, + { + "epoch": 5.9760956175298805, + "grad_norm": 0.972944438457489, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6750 + }, + { + "epoch": 5.984949092518813, + "grad_norm": 1.2385832071304321, + "learning_rate": 0.0002, + "loss": 1.3252, + "step": 6760 + }, + { + "epoch": 5.993802567507747, + "grad_norm": 0.9080338478088379, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6770 + }, + { + "epoch": 6.0, + "eval_loss": 2.062002658843994, + "eval_runtime": 83.2814, + "eval_samples_per_second": 6.184, + "eval_steps_per_second": 0.78, + "step": 6777 + }, + { + "epoch": 6.00265604249668, + "grad_norm": 0.8741096258163452, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 6780 + }, + { + "epoch": 6.011509517485613, + "grad_norm": 1.2510347366333008, + "learning_rate": 0.0002, + "loss": 1.1242, + "step": 6790 + }, + { + "epoch": 6.0203629924745465, + "grad_norm": 1.063910722732544, + "learning_rate": 0.0002, + "loss": 1.0269, + "step": 6800 + }, + { + "epoch": 6.029216467463479, + "grad_norm": 1.169573187828064, + "learning_rate": 0.0002, + "loss": 1.0468, + "step": 6810 + }, + { + "epoch": 6.038069942452412, + "grad_norm": 1.0453242063522339, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 6820 + }, + { + "epoch": 6.046923417441346, + "grad_norm": 1.1960445642471313, + "learning_rate": 0.0002, + "loss": 1.0469, + "step": 6830 + }, + { + "epoch": 6.055776892430279, + "grad_norm": 0.9427650570869446, + "learning_rate": 0.0002, + "loss": 1.1233, + "step": 6840 + }, + { + "epoch": 6.064630367419212, + "grad_norm": 1.2107350826263428, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 6850 + }, + { + "epoch": 6.073483842408145, + "grad_norm": 1.262130856513977, + "learning_rate": 0.0002, + "loss": 1.0751, + "step": 6860 + }, + { + "epoch": 6.082337317397078, + "grad_norm": 1.1628082990646362, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 6870 + }, + { + "epoch": 6.091190792386011, + "grad_norm": 1.0090514421463013, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 6880 + }, + { + "epoch": 6.100044267374945, + "grad_norm": 1.5029802322387695, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 6890 + }, + { + "epoch": 6.108897742363878, + "grad_norm": 1.0522133111953735, + "learning_rate": 0.0002, + "loss": 1.0549, + "step": 6900 + }, + { + "epoch": 6.117751217352811, + "grad_norm": 1.225534439086914, + "learning_rate": 0.0002, + "loss": 1.0502, + "step": 6910 + }, + { + "epoch": 6.126604692341744, + "grad_norm": 1.2859058380126953, + "learning_rate": 0.0002, + "loss": 1.0808, + "step": 6920 + }, + { + "epoch": 6.135458167330677, + "grad_norm": 1.215205192565918, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 6930 + }, + { + "epoch": 6.14431164231961, + "grad_norm": 1.1799274682998657, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6940 + }, + { + "epoch": 6.153165117308544, + "grad_norm": 1.2553550004959106, + "learning_rate": 0.0002, + "loss": 1.0749, + "step": 6950 + }, + { + "epoch": 6.162018592297477, + "grad_norm": 1.2171931266784668, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 6960 + }, + { + "epoch": 6.17087206728641, + "grad_norm": 1.1896923780441284, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 6970 + }, + { + "epoch": 6.179725542275343, + "grad_norm": 1.007250189781189, + "learning_rate": 0.0002, + "loss": 1.1477, + "step": 6980 + }, + { + "epoch": 6.188579017264276, + "grad_norm": 1.2109580039978027, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 6990 + }, + { + "epoch": 6.19743249225321, + "grad_norm": 1.2197009325027466, + "learning_rate": 0.0002, + "loss": 1.0809, + "step": 7000 + }, + { + "epoch": 6.206285967242143, + "grad_norm": 1.1417629718780518, + "learning_rate": 0.0002, + "loss": 1.1322, + "step": 7010 + }, + { + "epoch": 6.2151394422310755, + "grad_norm": 1.2337356805801392, + "learning_rate": 0.0002, + "loss": 1.0541, + "step": 7020 + }, + { + "epoch": 6.223992917220009, + "grad_norm": 1.1230454444885254, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 7030 + }, + { + "epoch": 6.232846392208942, + "grad_norm": 1.0634387731552124, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 7040 + }, + { + "epoch": 6.241699867197875, + "grad_norm": 1.1566855907440186, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 7050 + }, + { + "epoch": 6.250553342186809, + "grad_norm": 1.2251075506210327, + "learning_rate": 0.0002, + "loss": 1.063, + "step": 7060 + }, + { + "epoch": 6.2594068171757415, + "grad_norm": 1.2232472896575928, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 7070 + }, + { + "epoch": 6.268260292164674, + "grad_norm": 1.1014091968536377, + "learning_rate": 0.0002, + "loss": 1.0394, + "step": 7080 + }, + { + "epoch": 6.277113767153608, + "grad_norm": 1.322811245918274, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7090 + }, + { + "epoch": 6.285967242142541, + "grad_norm": 0.9820072650909424, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 7100 + }, + { + "epoch": 6.294820717131474, + "grad_norm": 1.13047456741333, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 7110 + }, + { + "epoch": 6.303674192120408, + "grad_norm": 1.145127534866333, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 7120 + }, + { + "epoch": 6.31252766710934, + "grad_norm": 1.101465106010437, + "learning_rate": 0.0002, + "loss": 1.089, + "step": 7130 + }, + { + "epoch": 6.321381142098273, + "grad_norm": 1.131705641746521, + "learning_rate": 0.0002, + "loss": 1.1122, + "step": 7140 + }, + { + "epoch": 6.330234617087207, + "grad_norm": 0.9876824617385864, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 7150 + }, + { + "epoch": 6.33908809207614, + "grad_norm": 1.2950096130371094, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 7160 + }, + { + "epoch": 6.347941567065073, + "grad_norm": 1.0496132373809814, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 7170 + }, + { + "epoch": 6.3567950420540065, + "grad_norm": 1.3835711479187012, + "learning_rate": 0.0002, + "loss": 1.1334, + "step": 7180 + }, + { + "epoch": 6.365648517042939, + "grad_norm": 1.176424503326416, + "learning_rate": 0.0002, + "loss": 0.9777, + "step": 7190 + }, + { + "epoch": 6.374501992031872, + "grad_norm": 1.3502846956253052, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 7200 + }, + { + "epoch": 6.383355467020806, + "grad_norm": 1.2429769039154053, + "learning_rate": 0.0002, + "loss": 1.0614, + "step": 7210 + }, + { + "epoch": 6.392208942009739, + "grad_norm": 1.138015866279602, + "learning_rate": 0.0002, + "loss": 1.1712, + "step": 7220 + }, + { + "epoch": 6.401062416998672, + "grad_norm": 1.4407539367675781, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 7230 + }, + { + "epoch": 6.409915891987605, + "grad_norm": 1.1464104652404785, + "learning_rate": 0.0002, + "loss": 1.1595, + "step": 7240 + }, + { + "epoch": 6.418769366976538, + "grad_norm": 1.2028888463974, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 7250 + }, + { + "epoch": 6.427622841965471, + "grad_norm": 1.132938027381897, + "learning_rate": 0.0002, + "loss": 1.1129, + "step": 7260 + }, + { + "epoch": 6.436476316954405, + "grad_norm": 1.2005301713943481, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7270 + }, + { + "epoch": 6.445329791943338, + "grad_norm": 1.0460501909255981, + "learning_rate": 0.0002, + "loss": 1.0538, + "step": 7280 + }, + { + "epoch": 6.4541832669322705, + "grad_norm": 1.1363240480422974, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 7290 + }, + { + "epoch": 6.463036741921204, + "grad_norm": 1.0439460277557373, + "learning_rate": 0.0002, + "loss": 1.1042, + "step": 7300 + }, + { + "epoch": 6.471890216910137, + "grad_norm": 1.1968905925750732, + "learning_rate": 0.0002, + "loss": 1.0896, + "step": 7310 + }, + { + "epoch": 6.48074369189907, + "grad_norm": 1.0443525314331055, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 7320 + }, + { + "epoch": 6.489597166888004, + "grad_norm": 1.2550246715545654, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 7330 + }, + { + "epoch": 6.4984506418769366, + "grad_norm": 1.2880409955978394, + "learning_rate": 0.0002, + "loss": 1.2028, + "step": 7340 + }, + { + "epoch": 6.507304116865869, + "grad_norm": 1.2390265464782715, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 7350 + }, + { + "epoch": 6.516157591854803, + "grad_norm": 1.0650159120559692, + "learning_rate": 0.0002, + "loss": 1.065, + "step": 7360 + }, + { + "epoch": 6.525011066843736, + "grad_norm": 1.4934154748916626, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 7370 + }, + { + "epoch": 6.533864541832669, + "grad_norm": 1.0902682542800903, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 7380 + }, + { + "epoch": 6.542718016821603, + "grad_norm": 1.1561789512634277, + "learning_rate": 0.0002, + "loss": 1.145, + "step": 7390 + }, + { + "epoch": 6.551571491810535, + "grad_norm": 1.1010485887527466, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 7400 + }, + { + "epoch": 6.560424966799468, + "grad_norm": 1.1616493463516235, + "learning_rate": 0.0002, + "loss": 1.1063, + "step": 7410 + }, + { + "epoch": 6.569278441788402, + "grad_norm": 1.2321627140045166, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 7420 + }, + { + "epoch": 6.578131916777335, + "grad_norm": 1.162299394607544, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 7430 + }, + { + "epoch": 6.586985391766269, + "grad_norm": 0.9935213923454285, + "learning_rate": 0.0002, + "loss": 1.1785, + "step": 7440 + }, + { + "epoch": 6.5958388667552015, + "grad_norm": 1.3035451173782349, + "learning_rate": 0.0002, + "loss": 1.078, + "step": 7450 + }, + { + "epoch": 6.604692341744134, + "grad_norm": 1.0957173109054565, + "learning_rate": 0.0002, + "loss": 1.1377, + "step": 7460 + }, + { + "epoch": 6.613545816733068, + "grad_norm": 1.166472315788269, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 7470 + }, + { + "epoch": 6.622399291722001, + "grad_norm": 1.332716464996338, + "learning_rate": 0.0002, + "loss": 1.1379, + "step": 7480 + }, + { + "epoch": 6.631252766710934, + "grad_norm": 1.1008102893829346, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 7490 + }, + { + "epoch": 6.6401062416998675, + "grad_norm": 1.4472310543060303, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 7500 + }, + { + "epoch": 6.6489597166888, + "grad_norm": 1.1247508525848389, + "learning_rate": 0.0002, + "loss": 1.1729, + "step": 7510 + }, + { + "epoch": 6.657813191677733, + "grad_norm": 1.297936201095581, + "learning_rate": 0.0002, + "loss": 1.1649, + "step": 7520 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0784718990325928, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 7530 + }, + { + "epoch": 6.6755201416556, + "grad_norm": 1.1518864631652832, + "learning_rate": 0.0002, + "loss": 1.0852, + "step": 7540 + }, + { + "epoch": 6.684373616644533, + "grad_norm": 1.1135684251785278, + "learning_rate": 0.0002, + "loss": 1.1611, + "step": 7550 + }, + { + "epoch": 6.693227091633466, + "grad_norm": 1.0792579650878906, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 7560 + }, + { + "epoch": 6.702080566622399, + "grad_norm": 1.1826539039611816, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 7570 + }, + { + "epoch": 6.710934041611332, + "grad_norm": 1.1485552787780762, + "learning_rate": 0.0002, + "loss": 1.0874, + "step": 7580 + }, + { + "epoch": 6.719787516600266, + "grad_norm": 1.090723991394043, + "learning_rate": 0.0002, + "loss": 1.0502, + "step": 7590 + }, + { + "epoch": 6.728640991589199, + "grad_norm": 1.105883002281189, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7600 + }, + { + "epoch": 6.737494466578132, + "grad_norm": 1.3093862533569336, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 7610 + }, + { + "epoch": 6.746347941567065, + "grad_norm": 1.0273808240890503, + "learning_rate": 0.0002, + "loss": 1.1202, + "step": 7620 + }, + { + "epoch": 6.755201416555998, + "grad_norm": 1.3253363370895386, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 7630 + }, + { + "epoch": 6.764054891544931, + "grad_norm": 1.1979365348815918, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 7640 + }, + { + "epoch": 6.772908366533865, + "grad_norm": 1.123506784439087, + "learning_rate": 0.0002, + "loss": 1.1208, + "step": 7650 + }, + { + "epoch": 6.781761841522798, + "grad_norm": 1.3928422927856445, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 7660 + }, + { + "epoch": 6.790615316511731, + "grad_norm": 1.1540825366973877, + "learning_rate": 0.0002, + "loss": 1.1535, + "step": 7670 + }, + { + "epoch": 6.799468791500664, + "grad_norm": 1.0836732387542725, + "learning_rate": 0.0002, + "loss": 1.1053, + "step": 7680 + }, + { + "epoch": 6.808322266489597, + "grad_norm": 1.0360240936279297, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 7690 + }, + { + "epoch": 6.817175741478531, + "grad_norm": 1.2440129518508911, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 7700 + }, + { + "epoch": 6.826029216467464, + "grad_norm": 1.1702594757080078, + "learning_rate": 0.0002, + "loss": 1.1245, + "step": 7710 + }, + { + "epoch": 6.8348826914563965, + "grad_norm": 1.0726280212402344, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 7720 + }, + { + "epoch": 6.84373616644533, + "grad_norm": 0.9410907030105591, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 7730 + }, + { + "epoch": 6.852589641434263, + "grad_norm": 1.042914867401123, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 7740 + }, + { + "epoch": 6.861443116423196, + "grad_norm": 1.1028170585632324, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 7750 + }, + { + "epoch": 6.87029659141213, + "grad_norm": 1.0990355014801025, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 7760 + }, + { + "epoch": 6.8791500664010625, + "grad_norm": 1.2572479248046875, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 7770 + }, + { + "epoch": 6.888003541389995, + "grad_norm": 1.250198483467102, + "learning_rate": 0.0002, + "loss": 1.1749, + "step": 7780 + }, + { + "epoch": 6.896857016378929, + "grad_norm": 1.1872532367706299, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 7790 + }, + { + "epoch": 6.905710491367862, + "grad_norm": 1.5275602340698242, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 7800 + }, + { + "epoch": 6.914563966356795, + "grad_norm": 1.015166163444519, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 7810 + }, + { + "epoch": 6.923417441345729, + "grad_norm": 1.3205344676971436, + "learning_rate": 0.0002, + "loss": 1.1931, + "step": 7820 + }, + { + "epoch": 6.932270916334661, + "grad_norm": 1.1329596042633057, + "learning_rate": 0.0002, + "loss": 1.222, + "step": 7830 + }, + { + "epoch": 6.941124391323594, + "grad_norm": 1.1614333391189575, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 7840 + }, + { + "epoch": 6.949977866312528, + "grad_norm": 1.3472208976745605, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 7850 + }, + { + "epoch": 6.958831341301461, + "grad_norm": 1.1490193605422974, + "learning_rate": 0.0002, + "loss": 1.1002, + "step": 7860 + }, + { + "epoch": 6.967684816290394, + "grad_norm": 1.1343097686767578, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 7870 + }, + { + "epoch": 6.9765382912793275, + "grad_norm": 1.2555341720581055, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 7880 + }, + { + "epoch": 6.98539176626826, + "grad_norm": 1.2695735692977905, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 7890 + }, + { + "epoch": 6.994245241257193, + "grad_norm": 1.1662464141845703, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 7900 + }, + { + "epoch": 6.999557326250553, + "eval_loss": 2.148611068725586, + "eval_runtime": 82.53, + "eval_samples_per_second": 6.24, + "eval_steps_per_second": 0.788, + "step": 7906 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.658951166294753e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-7906/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a4ae8b889baf459a988f633b5d65ab78b0de5b6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dce268f6e51f6dc870ed28d591d48fabd04ce2c4664679f25a891808953eee8 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d48424a97cc11f632af4d65944730c6ac36c3c7c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a13d8c62859d9598d3bbd429309429a4ad27d212aa4f09eecda99aed2416cc7 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fdcaad87384feba3f9255bd4192c2ff9184a01f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8ce2c1a542de171db8064f2f7d9df03662d9811b53ece1221d27b8566a254a0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e5b7e33df81d946442a6d88dc264bf7a28a3390 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79a3fcee3e6dd4df57ef36036257199225287682979181fd6f84094a94a03720 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bef6b1148c379e3f777ff0d4231cfb2a367cfff0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/trainer_state.json @@ -0,0 +1,6418 @@ +{ + "best_metric": 1.8077166080474854, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", + "epoch": 7.996458610004427, + "eval_steps": 10, + "global_step": 9032, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008853474988933156, + "grad_norm": 0.4775333106517792, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 10 + }, + { + "epoch": 0.017706949977866312, + "grad_norm": 0.5485824346542358, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 20 + }, + { + "epoch": 0.02656042496679947, + "grad_norm": 0.5675218105316162, + "learning_rate": 0.0002, + "loss": 2.0984, + "step": 30 + }, + { + "epoch": 0.035413899955732624, + "grad_norm": 0.696494460105896, + "learning_rate": 0.0002, + "loss": 2.0622, + "step": 40 + }, + { + "epoch": 0.04426737494466578, + "grad_norm": 0.4788398742675781, + "learning_rate": 0.0002, + "loss": 1.9547, + "step": 50 + }, + { + "epoch": 0.05312084993359894, + "grad_norm": 0.4763128161430359, + "learning_rate": 0.0002, + "loss": 1.8722, + "step": 60 + }, + { + "epoch": 0.0619743249225321, + "grad_norm": 0.5929698348045349, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 70 + }, + { + "epoch": 0.07082779991146525, + "grad_norm": 0.5899396538734436, + "learning_rate": 0.0002, + "loss": 1.9573, + "step": 80 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.460123747587204, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 90 + }, + { + "epoch": 0.08853474988933156, + "grad_norm": 0.4184812009334564, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 100 + }, + { + "epoch": 0.09738822487826472, + "grad_norm": 0.4051891267299652, + "learning_rate": 0.0002, + "loss": 1.8079, + "step": 110 + }, + { + "epoch": 0.10624169986719788, + "grad_norm": 0.3709661066532135, + "learning_rate": 0.0002, + "loss": 1.8911, + "step": 120 + }, + { + "epoch": 0.11509517485613104, + "grad_norm": 0.4783487915992737, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 130 + }, + { + "epoch": 0.1239486498450642, + "grad_norm": 0.36478137969970703, + "learning_rate": 0.0002, + "loss": 1.8602, + "step": 140 + }, + { + "epoch": 0.13280212483399734, + "grad_norm": 0.4005294442176819, + "learning_rate": 0.0002, + "loss": 1.7814, + "step": 150 + }, + { + "epoch": 0.1416555998229305, + "grad_norm": 0.42357513308525085, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 160 + }, + { + "epoch": 0.15050907481186365, + "grad_norm": 0.3913971781730652, + "learning_rate": 0.0002, + "loss": 1.8835, + "step": 170 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.4650019407272339, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 180 + }, + { + "epoch": 0.16821602478972997, + "grad_norm": 0.5545958876609802, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 190 + }, + { + "epoch": 0.17706949977866313, + "grad_norm": 0.3669356107711792, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 200 + }, + { + "epoch": 0.18592297476759628, + "grad_norm": 0.3683622181415558, + "learning_rate": 0.0002, + "loss": 1.8169, + "step": 210 + }, + { + "epoch": 0.19477644975652944, + "grad_norm": 0.39825671911239624, + "learning_rate": 0.0002, + "loss": 1.8117, + "step": 220 + }, + { + "epoch": 0.2036299247454626, + "grad_norm": 0.4298318326473236, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 230 + }, + { + "epoch": 0.21248339973439576, + "grad_norm": 0.36111244559288025, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 240 + }, + { + "epoch": 0.2213368747233289, + "grad_norm": 0.3711858093738556, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 250 + }, + { + "epoch": 0.23019034971226207, + "grad_norm": 0.37717559933662415, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 260 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.3678877651691437, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 270 + }, + { + "epoch": 0.2478972996901284, + "grad_norm": 0.4165912866592407, + "learning_rate": 0.0002, + "loss": 1.8235, + "step": 280 + }, + { + "epoch": 0.25675077467906154, + "grad_norm": 0.3403240740299225, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 290 + }, + { + "epoch": 0.2656042496679947, + "grad_norm": 0.4023234248161316, + "learning_rate": 0.0002, + "loss": 1.8704, + "step": 300 + }, + { + "epoch": 0.27445772465692786, + "grad_norm": 0.32472360134124756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 310 + }, + { + "epoch": 0.283311199645861, + "grad_norm": 0.36464595794677734, + "learning_rate": 0.0002, + "loss": 1.8544, + "step": 320 + }, + { + "epoch": 0.2921646746347942, + "grad_norm": 0.3868598937988281, + "learning_rate": 0.0002, + "loss": 1.8168, + "step": 330 + }, + { + "epoch": 0.3010181496237273, + "grad_norm": 0.3123539686203003, + "learning_rate": 0.0002, + "loss": 1.772, + "step": 340 + }, + { + "epoch": 0.3098716246126605, + "grad_norm": 0.3392639458179474, + "learning_rate": 0.0002, + "loss": 1.8285, + "step": 350 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.42070651054382324, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 360 + }, + { + "epoch": 0.3275785745905268, + "grad_norm": 0.3650900423526764, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 370 + }, + { + "epoch": 0.33643204957945994, + "grad_norm": 0.41388973593711853, + "learning_rate": 0.0002, + "loss": 1.8388, + "step": 380 + }, + { + "epoch": 0.3452855245683931, + "grad_norm": 0.36625272035598755, + "learning_rate": 0.0002, + "loss": 1.79, + "step": 390 + }, + { + "epoch": 0.35413899955732625, + "grad_norm": 0.3930284082889557, + "learning_rate": 0.0002, + "loss": 1.8271, + "step": 400 + }, + { + "epoch": 0.3629924745462594, + "grad_norm": 0.3415820300579071, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 410 + }, + { + "epoch": 0.37184594953519257, + "grad_norm": 0.4256570041179657, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 420 + }, + { + "epoch": 0.3806994245241257, + "grad_norm": 0.3740842938423157, + "learning_rate": 0.0002, + "loss": 1.7728, + "step": 430 + }, + { + "epoch": 0.3895528995130589, + "grad_norm": 0.334108829498291, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 440 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.33186739683151245, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 450 + }, + { + "epoch": 0.4072598494909252, + "grad_norm": 0.39127954840660095, + "learning_rate": 0.0002, + "loss": 1.8885, + "step": 460 + }, + { + "epoch": 0.4161133244798583, + "grad_norm": 0.331443727016449, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 470 + }, + { + "epoch": 0.4249667994687915, + "grad_norm": 0.36834150552749634, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 480 + }, + { + "epoch": 0.43382027445772464, + "grad_norm": 0.338123619556427, + "learning_rate": 0.0002, + "loss": 1.7549, + "step": 490 + }, + { + "epoch": 0.4426737494466578, + "grad_norm": 0.3891060948371887, + "learning_rate": 0.0002, + "loss": 1.795, + "step": 500 + }, + { + "epoch": 0.45152722443559096, + "grad_norm": 0.3486529290676117, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 510 + }, + { + "epoch": 0.46038069942452414, + "grad_norm": 0.3635135889053345, + "learning_rate": 0.0002, + "loss": 1.796, + "step": 520 + }, + { + "epoch": 0.4692341744134573, + "grad_norm": 0.7706693410873413, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 530 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.33725443482398987, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 540 + }, + { + "epoch": 0.4869411243913236, + "grad_norm": 0.3127504289150238, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 550 + }, + { + "epoch": 0.4957945993802568, + "grad_norm": 0.3527977466583252, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 560 + }, + { + "epoch": 0.5046480743691899, + "grad_norm": 0.3574548661708832, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 570 + }, + { + "epoch": 0.5135015493581231, + "grad_norm": 0.32787248492240906, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 580 + }, + { + "epoch": 0.5223550243470563, + "grad_norm": 0.3309430778026581, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 590 + }, + { + "epoch": 0.5312084993359893, + "grad_norm": 0.34276407957077026, + "learning_rate": 0.0002, + "loss": 1.7798, + "step": 600 + }, + { + "epoch": 0.5400619743249225, + "grad_norm": 0.3343711495399475, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 610 + }, + { + "epoch": 0.5489154493138557, + "grad_norm": 0.3193040192127228, + "learning_rate": 0.0002, + "loss": 1.7661, + "step": 620 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.3059828579425812, + "learning_rate": 0.0002, + "loss": 1.7769, + "step": 630 + }, + { + "epoch": 0.566622399291722, + "grad_norm": 0.37237173318862915, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 640 + }, + { + "epoch": 0.5754758742806552, + "grad_norm": 0.36022549867630005, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 650 + }, + { + "epoch": 0.5843293492695883, + "grad_norm": 0.34974920749664307, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 660 + }, + { + "epoch": 0.5931828242585214, + "grad_norm": 0.37135401368141174, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 670 + }, + { + "epoch": 0.6020362992474546, + "grad_norm": 0.3385699689388275, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 680 + }, + { + "epoch": 0.6108897742363878, + "grad_norm": 0.36015814542770386, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 690 + }, + { + "epoch": 0.619743249225321, + "grad_norm": 0.3503795564174652, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 700 + }, + { + "epoch": 0.628596724214254, + "grad_norm": 0.3447190225124359, + "learning_rate": 0.0002, + "loss": 1.7733, + "step": 710 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.3193499445915222, + "learning_rate": 0.0002, + "loss": 1.794, + "step": 720 + }, + { + "epoch": 0.6463036741921204, + "grad_norm": 0.37058180570602417, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 730 + }, + { + "epoch": 0.6551571491810536, + "grad_norm": 0.42216411232948303, + "learning_rate": 0.0002, + "loss": 1.8391, + "step": 740 + }, + { + "epoch": 0.6640106241699867, + "grad_norm": 0.3091185688972473, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 750 + }, + { + "epoch": 0.6728640991589199, + "grad_norm": 0.33168601989746094, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 760 + }, + { + "epoch": 0.6817175741478531, + "grad_norm": 0.31269341707229614, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 770 + }, + { + "epoch": 0.6905710491367862, + "grad_norm": 0.36125293374061584, + "learning_rate": 0.0002, + "loss": 1.8526, + "step": 780 + }, + { + "epoch": 0.6994245241257193, + "grad_norm": 0.3145293593406677, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 790 + }, + { + "epoch": 0.7082779991146525, + "grad_norm": 0.3611990809440613, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 800 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.3165971636772156, + "learning_rate": 0.0002, + "loss": 1.892, + "step": 810 + }, + { + "epoch": 0.7259849490925188, + "grad_norm": 0.3364323675632477, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 820 + }, + { + "epoch": 0.734838424081452, + "grad_norm": 0.4310600757598877, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 830 + }, + { + "epoch": 0.7436918990703851, + "grad_norm": 0.3414389491081238, + "learning_rate": 0.0002, + "loss": 1.7816, + "step": 840 + }, + { + "epoch": 0.7525453740593183, + "grad_norm": 0.35536202788352966, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 850 + }, + { + "epoch": 0.7613988490482514, + "grad_norm": 0.3232460618019104, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 860 + }, + { + "epoch": 0.7702523240371846, + "grad_norm": 0.32734858989715576, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 870 + }, + { + "epoch": 0.7791057990261178, + "grad_norm": 0.3433493673801422, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 880 + }, + { + "epoch": 0.787959274015051, + "grad_norm": 0.33354780077934265, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 890 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.30728545784950256, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 900 + }, + { + "epoch": 0.8056662239929172, + "grad_norm": 0.3373030126094818, + "learning_rate": 0.0002, + "loss": 1.8267, + "step": 910 + }, + { + "epoch": 0.8145196989818504, + "grad_norm": 0.3468782603740692, + "learning_rate": 0.0002, + "loss": 1.8479, + "step": 920 + }, + { + "epoch": 0.8233731739707836, + "grad_norm": 0.33520200848579407, + "learning_rate": 0.0002, + "loss": 1.8548, + "step": 930 + }, + { + "epoch": 0.8322266489597167, + "grad_norm": 0.35207098722457886, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 0.8410801239486498, + "grad_norm": 0.4000207483768463, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 950 + }, + { + "epoch": 0.849933598937583, + "grad_norm": 0.35362836718559265, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 960 + }, + { + "epoch": 0.8587870739265162, + "grad_norm": 0.3470745086669922, + "learning_rate": 0.0002, + "loss": 1.7497, + "step": 970 + }, + { + "epoch": 0.8676405489154493, + "grad_norm": 0.31602704524993896, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 980 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.3062942326068878, + "learning_rate": 0.0002, + "loss": 1.7734, + "step": 990 + }, + { + "epoch": 0.8853474988933157, + "grad_norm": 0.36963850259780884, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1000 + }, + { + "epoch": 0.8942009738822487, + "grad_norm": 0.3384034037590027, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1010 + }, + { + "epoch": 0.9030544488711819, + "grad_norm": 0.30436110496520996, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 1020 + }, + { + "epoch": 0.9119079238601151, + "grad_norm": 3.499784469604492, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1030 + }, + { + "epoch": 0.9207613988490483, + "grad_norm": 0.3130280375480652, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1040 + }, + { + "epoch": 0.9296148738379814, + "grad_norm": 0.29976674914360046, + "learning_rate": 0.0002, + "loss": 1.7527, + "step": 1050 + }, + { + "epoch": 0.9384683488269145, + "grad_norm": 0.35852617025375366, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 1060 + }, + { + "epoch": 0.9473218238158477, + "grad_norm": 0.3288591504096985, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1070 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.32641634345054626, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1080 + }, + { + "epoch": 0.965028773793714, + "grad_norm": 0.3305715322494507, + "learning_rate": 0.0002, + "loss": 1.7912, + "step": 1090 + }, + { + "epoch": 0.9738822487826472, + "grad_norm": 0.30650773644447327, + "learning_rate": 0.0002, + "loss": 1.8368, + "step": 1100 + }, + { + "epoch": 0.9827357237715804, + "grad_norm": 0.3330624997615814, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1110 + }, + { + "epoch": 0.9915891987605135, + "grad_norm": 0.3173314034938812, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 1120 + }, + { + "epoch": 0.9995573262505534, + "eval_loss": 1.8095673322677612, + "eval_runtime": 82.6312, + "eval_samples_per_second": 6.233, + "eval_steps_per_second": 0.787, + "step": 1129 + }, + { + "epoch": 1.0004426737494467, + "grad_norm": 0.3092995882034302, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1130 + }, + { + "epoch": 1.0092961487383798, + "grad_norm": 0.34386494755744934, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1140 + }, + { + "epoch": 1.0181496237273129, + "grad_norm": 0.2887897789478302, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 1150 + }, + { + "epoch": 1.0270030987162462, + "grad_norm": 0.3706893026828766, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1160 + }, + { + "epoch": 1.0358565737051793, + "grad_norm": 0.34724316000938416, + "learning_rate": 0.0002, + "loss": 1.6604, + "step": 1170 + }, + { + "epoch": 1.0447100486941125, + "grad_norm": 0.41001757979393005, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1180 + }, + { + "epoch": 1.0535635236830456, + "grad_norm": 0.34838348627090454, + "learning_rate": 0.0002, + "loss": 1.6332, + "step": 1190 + }, + { + "epoch": 1.0624169986719787, + "grad_norm": 0.37201181054115295, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1200 + }, + { + "epoch": 1.071270473660912, + "grad_norm": 0.36871352791786194, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1210 + }, + { + "epoch": 1.080123948649845, + "grad_norm": 0.35687458515167236, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 1220 + }, + { + "epoch": 1.0889774236387781, + "grad_norm": 0.3864741921424866, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 1230 + }, + { + "epoch": 1.0978308986277114, + "grad_norm": 0.3496808707714081, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1240 + }, + { + "epoch": 1.1066843736166445, + "grad_norm": 0.3444930911064148, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 1250 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.353188693523407, + "learning_rate": 0.0002, + "loss": 1.6672, + "step": 1260 + }, + { + "epoch": 1.1243913235945109, + "grad_norm": 0.3284400999546051, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1270 + }, + { + "epoch": 1.133244798583444, + "grad_norm": 0.3545348644256592, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 1280 + }, + { + "epoch": 1.1420982735723773, + "grad_norm": 0.3489900529384613, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1290 + }, + { + "epoch": 1.1509517485613103, + "grad_norm": 0.40355560183525085, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 1300 + }, + { + "epoch": 1.1598052235502434, + "grad_norm": 0.3369944095611572, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 1310 + }, + { + "epoch": 1.1686586985391767, + "grad_norm": 0.39141345024108887, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1320 + }, + { + "epoch": 1.1775121735281098, + "grad_norm": 0.36518552899360657, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 1330 + }, + { + "epoch": 1.1863656485170428, + "grad_norm": 0.3730056583881378, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 1340 + }, + { + "epoch": 1.1952191235059761, + "grad_norm": 0.37711501121520996, + "learning_rate": 0.0002, + "loss": 1.7613, + "step": 1350 + }, + { + "epoch": 1.2040725984949092, + "grad_norm": 0.3627128005027771, + "learning_rate": 0.0002, + "loss": 1.6423, + "step": 1360 + }, + { + "epoch": 1.2129260734838425, + "grad_norm": 0.3458651006221771, + "learning_rate": 0.0002, + "loss": 1.7214, + "step": 1370 + }, + { + "epoch": 1.2217795484727756, + "grad_norm": 0.392395555973053, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1380 + }, + { + "epoch": 1.2306330234617087, + "grad_norm": 0.3353286683559418, + "learning_rate": 0.0002, + "loss": 1.7785, + "step": 1390 + }, + { + "epoch": 1.239486498450642, + "grad_norm": 0.9545007944107056, + "learning_rate": 0.0002, + "loss": 1.7019, + "step": 1400 + }, + { + "epoch": 1.248339973439575, + "grad_norm": 0.37037935853004456, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1410 + }, + { + "epoch": 1.257193448428508, + "grad_norm": 0.3831497132778168, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1420 + }, + { + "epoch": 1.2660469234174414, + "grad_norm": 0.4633576273918152, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 1430 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.3690567910671234, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 1440 + }, + { + "epoch": 1.2837538733953076, + "grad_norm": 0.33980098366737366, + "learning_rate": 0.0002, + "loss": 1.767, + "step": 1450 + }, + { + "epoch": 1.2926073483842409, + "grad_norm": 0.3731277287006378, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1460 + }, + { + "epoch": 1.301460823373174, + "grad_norm": 0.3781551122665405, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 1470 + }, + { + "epoch": 1.310314298362107, + "grad_norm": 0.36511561274528503, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1480 + }, + { + "epoch": 1.3191677733510403, + "grad_norm": 0.3292245864868164, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1490 + }, + { + "epoch": 1.3280212483399734, + "grad_norm": 0.38758566975593567, + "learning_rate": 0.0002, + "loss": 1.7098, + "step": 1500 + }, + { + "epoch": 1.3368747233289067, + "grad_norm": 0.3993414044380188, + "learning_rate": 0.0002, + "loss": 1.7364, + "step": 1510 + }, + { + "epoch": 1.3457281983178397, + "grad_norm": 0.35689303278923035, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1520 + }, + { + "epoch": 1.354581673306773, + "grad_norm": 0.41849321126937866, + "learning_rate": 0.0002, + "loss": 1.7082, + "step": 1530 + }, + { + "epoch": 1.3634351482957061, + "grad_norm": 0.36752554774284363, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1540 + }, + { + "epoch": 1.3722886232846392, + "grad_norm": 0.36915940046310425, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 1550 + }, + { + "epoch": 1.3811420982735725, + "grad_norm": 0.3656710386276245, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1560 + }, + { + "epoch": 1.3899955732625056, + "grad_norm": 0.32055532932281494, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 1570 + }, + { + "epoch": 1.3988490482514386, + "grad_norm": 0.35031241178512573, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1580 + }, + { + "epoch": 1.407702523240372, + "grad_norm": 0.44541189074516296, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 1590 + }, + { + "epoch": 1.416555998229305, + "grad_norm": 0.36922356486320496, + "learning_rate": 0.0002, + "loss": 1.8624, + "step": 1600 + }, + { + "epoch": 1.425409473218238, + "grad_norm": 0.3470565974712372, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 1610 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.3743111193180084, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 1620 + }, + { + "epoch": 1.4431164231961044, + "grad_norm": 0.3619250953197479, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1630 + }, + { + "epoch": 1.4519698981850375, + "grad_norm": 0.4028145968914032, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 1640 + }, + { + "epoch": 1.4608233731739708, + "grad_norm": 0.36065351963043213, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1650 + }, + { + "epoch": 1.469676848162904, + "grad_norm": 0.44304442405700684, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 1660 + }, + { + "epoch": 1.478530323151837, + "grad_norm": 0.35770007967948914, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 1670 + }, + { + "epoch": 1.4873837981407703, + "grad_norm": 0.37584400177001953, + "learning_rate": 0.0002, + "loss": 1.7588, + "step": 1680 + }, + { + "epoch": 1.4962372731297033, + "grad_norm": 0.37151241302490234, + "learning_rate": 0.0002, + "loss": 1.63, + "step": 1690 + }, + { + "epoch": 1.5050907481186364, + "grad_norm": 0.36422812938690186, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 1.5139442231075697, + "grad_norm": 0.3680015206336975, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 1710 + }, + { + "epoch": 1.522797698096503, + "grad_norm": 0.3356926441192627, + "learning_rate": 0.0002, + "loss": 1.6917, + "step": 1720 + }, + { + "epoch": 1.531651173085436, + "grad_norm": 0.37887054681777954, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1730 + }, + { + "epoch": 1.5405046480743692, + "grad_norm": 0.37052762508392334, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1740 + }, + { + "epoch": 1.5493581230633025, + "grad_norm": 0.333925724029541, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 1750 + }, + { + "epoch": 1.5582115980522355, + "grad_norm": 0.3722778558731079, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1760 + }, + { + "epoch": 1.5670650730411686, + "grad_norm": 0.3331141173839569, + "learning_rate": 0.0002, + "loss": 1.6923, + "step": 1770 + }, + { + "epoch": 1.575918548030102, + "grad_norm": 0.3670045733451843, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1780 + }, + { + "epoch": 1.584772023019035, + "grad_norm": 0.3769885301589966, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.4266890287399292, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1800 + }, + { + "epoch": 1.6024789729969013, + "grad_norm": 0.37174347043037415, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1810 + }, + { + "epoch": 1.6113324479858344, + "grad_norm": 0.3599846363067627, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1820 + }, + { + "epoch": 1.6201859229747675, + "grad_norm": 0.3364820182323456, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1830 + }, + { + "epoch": 1.6290393979637008, + "grad_norm": 0.3874799907207489, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 1840 + }, + { + "epoch": 1.6378928729526339, + "grad_norm": 0.3706085681915283, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 1850 + }, + { + "epoch": 1.646746347941567, + "grad_norm": 0.3997809886932373, + "learning_rate": 0.0002, + "loss": 1.6761, + "step": 1860 + }, + { + "epoch": 1.6555998229305002, + "grad_norm": 0.4033166170120239, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 1870 + }, + { + "epoch": 1.6644532979194335, + "grad_norm": 0.3944370150566101, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1880 + }, + { + "epoch": 1.6733067729083664, + "grad_norm": 0.3467825651168823, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1890 + }, + { + "epoch": 1.6821602478972997, + "grad_norm": 0.35290950536727905, + "learning_rate": 0.0002, + "loss": 1.7462, + "step": 1900 + }, + { + "epoch": 1.691013722886233, + "grad_norm": 0.3664521872997284, + "learning_rate": 0.0002, + "loss": 1.7634, + "step": 1910 + }, + { + "epoch": 1.699867197875166, + "grad_norm": 0.33863595128059387, + "learning_rate": 0.0002, + "loss": 1.7922, + "step": 1920 + }, + { + "epoch": 1.7087206728640991, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 1930 + }, + { + "epoch": 1.7175741478530324, + "grad_norm": 0.35060688853263855, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1940 + }, + { + "epoch": 1.7264276228419655, + "grad_norm": 0.33741647005081177, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 1950 + }, + { + "epoch": 1.7352810978308986, + "grad_norm": 0.36190304160118103, + "learning_rate": 0.0002, + "loss": 1.6971, + "step": 1960 + }, + { + "epoch": 1.7441345728198319, + "grad_norm": 0.3412845730781555, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 1970 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.3841935694217682, + "learning_rate": 0.0002, + "loss": 1.7038, + "step": 1980 + }, + { + "epoch": 1.761841522797698, + "grad_norm": 0.39062076807022095, + "learning_rate": 0.0002, + "loss": 1.7185, + "step": 1990 + }, + { + "epoch": 1.7706949977866313, + "grad_norm": 0.3741697669029236, + "learning_rate": 0.0002, + "loss": 1.7346, + "step": 2000 + }, + { + "epoch": 1.7795484727755644, + "grad_norm": 0.4160231053829193, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2010 + }, + { + "epoch": 1.7884019477644975, + "grad_norm": 0.3602111339569092, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 2020 + }, + { + "epoch": 1.7972554227534308, + "grad_norm": 0.36740878224372864, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 2030 + }, + { + "epoch": 1.8061088977423638, + "grad_norm": 0.419039249420166, + "learning_rate": 0.0002, + "loss": 1.7043, + "step": 2040 + }, + { + "epoch": 1.814962372731297, + "grad_norm": 0.3511838912963867, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 2050 + }, + { + "epoch": 1.8238158477202302, + "grad_norm": 0.3580166697502136, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2060 + }, + { + "epoch": 1.8326693227091635, + "grad_norm": 0.40928223729133606, + "learning_rate": 0.0002, + "loss": 1.7562, + "step": 2070 + }, + { + "epoch": 1.8415227976980963, + "grad_norm": 0.37134310603141785, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 2080 + }, + { + "epoch": 1.8503762726870296, + "grad_norm": 0.3924112319946289, + "learning_rate": 0.0002, + "loss": 1.6829, + "step": 2090 + }, + { + "epoch": 1.859229747675963, + "grad_norm": 0.3215042054653168, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 2100 + }, + { + "epoch": 1.868083222664896, + "grad_norm": 0.37674015760421753, + "learning_rate": 0.0002, + "loss": 1.6864, + "step": 2110 + }, + { + "epoch": 1.876936697653829, + "grad_norm": 0.370856374502182, + "learning_rate": 0.0002, + "loss": 1.7313, + "step": 2120 + }, + { + "epoch": 1.8857901726427624, + "grad_norm": 0.35783782601356506, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 2130 + }, + { + "epoch": 1.8946436476316955, + "grad_norm": 0.39538058638572693, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 2140 + }, + { + "epoch": 1.9034971226206285, + "grad_norm": 0.36677780747413635, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 2150 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.39032700657844543, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 2160 + }, + { + "epoch": 1.921204072598495, + "grad_norm": 0.39762043952941895, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2170 + }, + { + "epoch": 1.930057547587428, + "grad_norm": 0.5400257110595703, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 2180 + }, + { + "epoch": 1.9389110225763613, + "grad_norm": 0.3650212287902832, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 2190 + }, + { + "epoch": 1.9477644975652944, + "grad_norm": 0.3583165109157562, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 2200 + }, + { + "epoch": 1.9566179725542274, + "grad_norm": 0.4031282365322113, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 2210 + }, + { + "epoch": 1.9654714475431607, + "grad_norm": 0.3673221170902252, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 2220 + }, + { + "epoch": 1.9743249225320938, + "grad_norm": 0.3920327126979828, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 2230 + }, + { + "epoch": 1.9831783975210269, + "grad_norm": 0.4765491783618927, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 2240 + }, + { + "epoch": 1.9920318725099602, + "grad_norm": 0.38130584359169006, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 2250 + }, + { + "epoch": 2.0, + "eval_loss": 1.8077166080474854, + "eval_runtime": 82.8351, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 0.785, + "step": 2259 + }, + { + "epoch": 2.0008853474988935, + "grad_norm": 0.34340235590934753, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 2260 + }, + { + "epoch": 2.0097388224878263, + "grad_norm": 0.3710762858390808, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2270 + }, + { + "epoch": 2.0185922974767596, + "grad_norm": 0.35640114545822144, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2280 + }, + { + "epoch": 2.027445772465693, + "grad_norm": 0.45970189571380615, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 2290 + }, + { + "epoch": 2.0362992474546258, + "grad_norm": 0.4256797134876251, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 2300 + }, + { + "epoch": 2.045152722443559, + "grad_norm": 0.42421531677246094, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2310 + }, + { + "epoch": 2.0540061974324924, + "grad_norm": 0.4032478928565979, + "learning_rate": 0.0002, + "loss": 1.6117, + "step": 2320 + }, + { + "epoch": 2.062859672421425, + "grad_norm": 0.4073623716831207, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 2330 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.4845200777053833, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2340 + }, + { + "epoch": 2.080566622399292, + "grad_norm": 0.40578293800354004, + "learning_rate": 0.0002, + "loss": 1.5734, + "step": 2350 + }, + { + "epoch": 2.089420097388225, + "grad_norm": 0.4037284255027771, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2360 + }, + { + "epoch": 2.098273572377158, + "grad_norm": 0.4717613160610199, + "learning_rate": 0.0002, + "loss": 1.6511, + "step": 2370 + }, + { + "epoch": 2.1071270473660912, + "grad_norm": 0.42076411843299866, + "learning_rate": 0.0002, + "loss": 1.6273, + "step": 2380 + }, + { + "epoch": 2.1159805223550245, + "grad_norm": 0.47799113392829895, + "learning_rate": 0.0002, + "loss": 1.654, + "step": 2390 + }, + { + "epoch": 2.1248339973439574, + "grad_norm": 0.4253084063529968, + "learning_rate": 0.0002, + "loss": 1.5528, + "step": 2400 + }, + { + "epoch": 2.1336874723328907, + "grad_norm": 0.5023085474967957, + "learning_rate": 0.0002, + "loss": 1.6432, + "step": 2410 + }, + { + "epoch": 2.142540947321824, + "grad_norm": 0.49162712693214417, + "learning_rate": 0.0002, + "loss": 1.5926, + "step": 2420 + }, + { + "epoch": 2.151394422310757, + "grad_norm": 0.39035019278526306, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 2430 + }, + { + "epoch": 2.16024789729969, + "grad_norm": 0.43223854899406433, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 2440 + }, + { + "epoch": 2.1691013722886234, + "grad_norm": 0.4596616327762604, + "learning_rate": 0.0002, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 2.1779548472775563, + "grad_norm": 0.4469447731971741, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 2460 + }, + { + "epoch": 2.1868083222664896, + "grad_norm": 0.5100595355033875, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 2470 + }, + { + "epoch": 2.195661797255423, + "grad_norm": 0.4169430732727051, + "learning_rate": 0.0002, + "loss": 1.6456, + "step": 2480 + }, + { + "epoch": 2.2045152722443557, + "grad_norm": 0.4699254035949707, + "learning_rate": 0.0002, + "loss": 1.6734, + "step": 2490 + }, + { + "epoch": 2.213368747233289, + "grad_norm": 0.43524250388145447, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 2500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4496648907661438, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2510 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.43408212065696716, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2520 + }, + { + "epoch": 2.2399291722000885, + "grad_norm": 0.4596034288406372, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 2530 + }, + { + "epoch": 2.2487826471890218, + "grad_norm": 0.5217021107673645, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 2540 + }, + { + "epoch": 2.2576361221779546, + "grad_norm": 0.44745638966560364, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 2550 + }, + { + "epoch": 2.266489597166888, + "grad_norm": 0.4484798014163971, + "learning_rate": 0.0002, + "loss": 1.675, + "step": 2560 + }, + { + "epoch": 2.275343072155821, + "grad_norm": 0.4428067207336426, + "learning_rate": 0.0002, + "loss": 1.5321, + "step": 2570 + }, + { + "epoch": 2.2841965471447545, + "grad_norm": 0.5095171332359314, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2580 + }, + { + "epoch": 2.2930500221336874, + "grad_norm": 0.44833096861839294, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 2590 + }, + { + "epoch": 2.3019034971226207, + "grad_norm": 0.507905900478363, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 2600 + }, + { + "epoch": 2.310756972111554, + "grad_norm": 0.40808171033859253, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 2610 + }, + { + "epoch": 2.319610447100487, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 2620 + }, + { + "epoch": 2.32846392208942, + "grad_norm": 0.44864922761917114, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2630 + }, + { + "epoch": 2.3373173970783534, + "grad_norm": 0.4174162745475769, + "learning_rate": 0.0002, + "loss": 1.5828, + "step": 2640 + }, + { + "epoch": 2.3461708720672863, + "grad_norm": 0.42314743995666504, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 2650 + }, + { + "epoch": 2.3550243470562195, + "grad_norm": 0.49224185943603516, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 2660 + }, + { + "epoch": 2.363877822045153, + "grad_norm": 0.45190292596817017, + "learning_rate": 0.0002, + "loss": 1.5766, + "step": 2670 + }, + { + "epoch": 2.3727312970340857, + "grad_norm": 0.41817107796669006, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 2680 + }, + { + "epoch": 2.381584772023019, + "grad_norm": 0.6436763405799866, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2690 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.47175949811935425, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 2700 + }, + { + "epoch": 2.3992917220008856, + "grad_norm": 0.480339378118515, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 2710 + }, + { + "epoch": 2.4081451969898184, + "grad_norm": 0.4723486006259918, + "learning_rate": 0.0002, + "loss": 1.5697, + "step": 2720 + }, + { + "epoch": 2.4169986719787517, + "grad_norm": 0.4305492043495178, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 2730 + }, + { + "epoch": 2.425852146967685, + "grad_norm": 0.5007492303848267, + "learning_rate": 0.0002, + "loss": 1.71, + "step": 2740 + }, + { + "epoch": 2.434705621956618, + "grad_norm": 0.5374062061309814, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 2750 + }, + { + "epoch": 2.443559096945551, + "grad_norm": 0.45866212248802185, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 2760 + }, + { + "epoch": 2.4524125719344845, + "grad_norm": 0.47914502024650574, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 2770 + }, + { + "epoch": 2.4612660469234173, + "grad_norm": 0.43804746866226196, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 2780 + }, + { + "epoch": 2.4701195219123506, + "grad_norm": 0.43656906485557556, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 2790 + }, + { + "epoch": 2.478972996901284, + "grad_norm": 0.4820363521575928, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 2800 + }, + { + "epoch": 2.4878264718902168, + "grad_norm": 0.4916800558567047, + "learning_rate": 0.0002, + "loss": 1.5375, + "step": 2810 + }, + { + "epoch": 2.49667994687915, + "grad_norm": 0.4521256983280182, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2820 + }, + { + "epoch": 2.5055334218680834, + "grad_norm": 0.5066806674003601, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 2830 + }, + { + "epoch": 2.514386896857016, + "grad_norm": 0.4768151640892029, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 2840 + }, + { + "epoch": 2.5232403718459495, + "grad_norm": 0.5144683718681335, + "learning_rate": 0.0002, + "loss": 1.6719, + "step": 2850 + }, + { + "epoch": 2.532093846834883, + "grad_norm": 0.4718942940235138, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2860 + }, + { + "epoch": 2.5409473218238157, + "grad_norm": 0.4924587309360504, + "learning_rate": 0.0002, + "loss": 1.6099, + "step": 2870 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.4649953842163086, + "learning_rate": 0.0002, + "loss": 1.5994, + "step": 2880 + }, + { + "epoch": 2.5586542718016823, + "grad_norm": 0.4836665987968445, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2890 + }, + { + "epoch": 2.567507746790615, + "grad_norm": 0.4162124991416931, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 2900 + }, + { + "epoch": 2.5763612217795484, + "grad_norm": 0.4894537925720215, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2910 + }, + { + "epoch": 2.5852146967684817, + "grad_norm": 0.4539397358894348, + "learning_rate": 0.0002, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 2.5940681717574146, + "grad_norm": 0.4718773066997528, + "learning_rate": 0.0002, + "loss": 1.6449, + "step": 2930 + }, + { + "epoch": 2.602921646746348, + "grad_norm": 0.49989837408065796, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 2940 + }, + { + "epoch": 2.611775121735281, + "grad_norm": 0.4862406849861145, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 2950 + }, + { + "epoch": 2.620628596724214, + "grad_norm": 0.4244804382324219, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 2960 + }, + { + "epoch": 2.6294820717131473, + "grad_norm": 0.49304354190826416, + "learning_rate": 0.0002, + "loss": 1.7795, + "step": 2970 + }, + { + "epoch": 2.6383355467020806, + "grad_norm": 0.4818236529827118, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 2980 + }, + { + "epoch": 2.647189021691014, + "grad_norm": 0.5077425837516785, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 2990 + }, + { + "epoch": 2.6560424966799467, + "grad_norm": 0.4494157135486603, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 3000 + }, + { + "epoch": 2.66489597166888, + "grad_norm": 0.4790278971195221, + "learning_rate": 0.0002, + "loss": 1.6792, + "step": 3010 + }, + { + "epoch": 2.6737494466578133, + "grad_norm": 0.4702624976634979, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3020 + }, + { + "epoch": 2.682602921646746, + "grad_norm": 0.5082133412361145, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3030 + }, + { + "epoch": 2.6914563966356795, + "grad_norm": 0.4553256630897522, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 3040 + }, + { + "epoch": 2.700309871624613, + "grad_norm": 0.4492715001106262, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3050 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.4555944502353668, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 3060 + }, + { + "epoch": 2.718016821602479, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 3070 + }, + { + "epoch": 2.7268702965914122, + "grad_norm": 0.4628562927246094, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3080 + }, + { + "epoch": 2.7357237715803455, + "grad_norm": 0.5169575810432434, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 3090 + }, + { + "epoch": 2.7445772465692784, + "grad_norm": 0.4630090892314911, + "learning_rate": 0.0002, + "loss": 1.562, + "step": 3100 + }, + { + "epoch": 2.7534307215582117, + "grad_norm": 0.5437219738960266, + "learning_rate": 0.0002, + "loss": 1.5508, + "step": 3110 + }, + { + "epoch": 2.762284196547145, + "grad_norm": 0.5102152228355408, + "learning_rate": 0.0002, + "loss": 1.6442, + "step": 3120 + }, + { + "epoch": 2.771137671536078, + "grad_norm": 0.48287826776504517, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3130 + }, + { + "epoch": 2.779991146525011, + "grad_norm": 0.4671737253665924, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 3140 + }, + { + "epoch": 2.7888446215139444, + "grad_norm": 0.5177035331726074, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 3150 + }, + { + "epoch": 2.7976980965028773, + "grad_norm": 0.450989305973053, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 3160 + }, + { + "epoch": 2.8065515714918106, + "grad_norm": 0.45007848739624023, + "learning_rate": 0.0002, + "loss": 1.597, + "step": 3170 + }, + { + "epoch": 2.815405046480744, + "grad_norm": 0.4600294530391693, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3180 + }, + { + "epoch": 2.8242585214696767, + "grad_norm": 0.485628604888916, + "learning_rate": 0.0002, + "loss": 1.6441, + "step": 3190 + }, + { + "epoch": 2.83311199645861, + "grad_norm": 0.49811574816703796, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 3200 + }, + { + "epoch": 2.8419654714475433, + "grad_norm": 0.5012516975402832, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 3210 + }, + { + "epoch": 2.850818946436476, + "grad_norm": 0.4552757740020752, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 3220 + }, + { + "epoch": 2.8596724214254094, + "grad_norm": 0.4539635479450226, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 3230 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 3240 + }, + { + "epoch": 2.8773793714032756, + "grad_norm": 0.4570811688899994, + "learning_rate": 0.0002, + "loss": 1.6065, + "step": 3250 + }, + { + "epoch": 2.886232846392209, + "grad_norm": 0.48181653022766113, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 3260 + }, + { + "epoch": 2.895086321381142, + "grad_norm": 0.4871032238006592, + "learning_rate": 0.0002, + "loss": 1.6574, + "step": 3270 + }, + { + "epoch": 2.903939796370075, + "grad_norm": 0.4643239676952362, + "learning_rate": 0.0002, + "loss": 1.5626, + "step": 3280 + }, + { + "epoch": 2.9127932713590083, + "grad_norm": 0.5024484395980835, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3290 + }, + { + "epoch": 2.9216467463479416, + "grad_norm": 0.4425384998321533, + "learning_rate": 0.0002, + "loss": 1.5756, + "step": 3300 + }, + { + "epoch": 2.9305002213368745, + "grad_norm": 0.459168016910553, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 3310 + }, + { + "epoch": 2.939353696325808, + "grad_norm": 0.4950717091560364, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 3320 + }, + { + "epoch": 2.948207171314741, + "grad_norm": 0.4516230523586273, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3330 + }, + { + "epoch": 2.957060646303674, + "grad_norm": 0.49523285031318665, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 3340 + }, + { + "epoch": 2.9659141212926072, + "grad_norm": 0.49282631278038025, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 3350 + }, + { + "epoch": 2.9747675962815405, + "grad_norm": 0.45825016498565674, + "learning_rate": 0.0002, + "loss": 1.6519, + "step": 3360 + }, + { + "epoch": 2.983621071270474, + "grad_norm": 0.4952891170978546, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 3370 + }, + { + "epoch": 2.9924745462594067, + "grad_norm": 0.42182639241218567, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 3380 + }, + { + "epoch": 2.9995573262505535, + "eval_loss": 1.8308420181274414, + "eval_runtime": 82.786, + "eval_samples_per_second": 6.221, + "eval_steps_per_second": 0.785, + "step": 3388 + }, + { + "epoch": 3.00132802124834, + "grad_norm": 0.47721418738365173, + "learning_rate": 0.0002, + "loss": 1.5811, + "step": 3390 + }, + { + "epoch": 3.0101814962372733, + "grad_norm": 0.5284923911094666, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 3400 + }, + { + "epoch": 3.019034971226206, + "grad_norm": 0.5607061982154846, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 3410 + }, + { + "epoch": 3.0278884462151394, + "grad_norm": 0.5271363258361816, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 3420 + }, + { + "epoch": 3.0367419212040727, + "grad_norm": 0.48660898208618164, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 3430 + }, + { + "epoch": 3.0455953961930056, + "grad_norm": 0.5767933130264282, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 3440 + }, + { + "epoch": 3.054448871181939, + "grad_norm": 0.5591282248497009, + "learning_rate": 0.0002, + "loss": 1.4647, + "step": 3450 + }, + { + "epoch": 3.063302346170872, + "grad_norm": 0.5870814323425293, + "learning_rate": 0.0002, + "loss": 1.5112, + "step": 3460 + }, + { + "epoch": 3.072155821159805, + "grad_norm": 0.4861546456813812, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 3470 + }, + { + "epoch": 3.0810092961487383, + "grad_norm": 0.5238925814628601, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 3480 + }, + { + "epoch": 3.0898627711376716, + "grad_norm": 0.5521751046180725, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 3490 + }, + { + "epoch": 3.098716246126605, + "grad_norm": 0.5816575884819031, + "learning_rate": 0.0002, + "loss": 1.4454, + "step": 3500 + }, + { + "epoch": 3.1075697211155378, + "grad_norm": 0.5281513333320618, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 3510 + }, + { + "epoch": 3.116423196104471, + "grad_norm": 0.5847303867340088, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 3520 + }, + { + "epoch": 3.1252766710934043, + "grad_norm": 0.5683517456054688, + "learning_rate": 0.0002, + "loss": 1.5513, + "step": 3530 + }, + { + "epoch": 3.134130146082337, + "grad_norm": 0.5177015662193298, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 3540 + }, + { + "epoch": 3.1429836210712705, + "grad_norm": 0.5922423601150513, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 3550 + }, + { + "epoch": 3.151837096060204, + "grad_norm": 0.7018587589263916, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 3560 + }, + { + "epoch": 3.1606905710491366, + "grad_norm": 0.6152004599571228, + "learning_rate": 0.0002, + "loss": 1.4677, + "step": 3570 + }, + { + "epoch": 3.16954404603807, + "grad_norm": 0.5350717902183533, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 3580 + }, + { + "epoch": 3.1783975210270032, + "grad_norm": 0.5971009731292725, + "learning_rate": 0.0002, + "loss": 1.4739, + "step": 3590 + }, + { + "epoch": 3.187250996015936, + "grad_norm": 0.7312001585960388, + "learning_rate": 0.0002, + "loss": 1.541, + "step": 3600 + }, + { + "epoch": 3.1961044710048694, + "grad_norm": 0.6372535228729248, + "learning_rate": 0.0002, + "loss": 1.5803, + "step": 3610 + }, + { + "epoch": 3.2049579459938027, + "grad_norm": 0.6098020672798157, + "learning_rate": 0.0002, + "loss": 1.4642, + "step": 3620 + }, + { + "epoch": 3.2138114209827355, + "grad_norm": 0.5506435632705688, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3630 + }, + { + "epoch": 3.222664895971669, + "grad_norm": 0.6043022274971008, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3640 + }, + { + "epoch": 3.231518370960602, + "grad_norm": 0.5495519042015076, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3650 + }, + { + "epoch": 3.240371845949535, + "grad_norm": 0.5769572257995605, + "learning_rate": 0.0002, + "loss": 1.3879, + "step": 3660 + }, + { + "epoch": 3.2492253209384683, + "grad_norm": 0.6833786964416504, + "learning_rate": 0.0002, + "loss": 1.4604, + "step": 3670 + }, + { + "epoch": 3.2580787959274016, + "grad_norm": 0.6962856650352478, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3680 + }, + { + "epoch": 3.2669322709163344, + "grad_norm": 0.6553098559379578, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 3690 + }, + { + "epoch": 3.2757857459052677, + "grad_norm": 0.5907557010650635, + "learning_rate": 0.0002, + "loss": 1.5416, + "step": 3700 + }, + { + "epoch": 3.284639220894201, + "grad_norm": 0.5712862014770508, + "learning_rate": 0.0002, + "loss": 1.5012, + "step": 3710 + }, + { + "epoch": 3.2934926958831343, + "grad_norm": 0.573820948600769, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 3720 + }, + { + "epoch": 3.302346170872067, + "grad_norm": 0.6650304198265076, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 3730 + }, + { + "epoch": 3.3111996458610005, + "grad_norm": 0.5182583928108215, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 3740 + }, + { + "epoch": 3.3200531208499338, + "grad_norm": 0.5078902840614319, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 3750 + }, + { + "epoch": 3.3289065958388666, + "grad_norm": 0.7062374353408813, + "learning_rate": 0.0002, + "loss": 1.4881, + "step": 3760 + }, + { + "epoch": 3.3377600708278, + "grad_norm": 0.5711262822151184, + "learning_rate": 0.0002, + "loss": 1.5017, + "step": 3770 + }, + { + "epoch": 3.346613545816733, + "grad_norm": 0.5624606013298035, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 3780 + }, + { + "epoch": 3.355467020805666, + "grad_norm": 0.6008231043815613, + "learning_rate": 0.0002, + "loss": 1.4515, + "step": 3790 + }, + { + "epoch": 3.3643204957945994, + "grad_norm": 0.6120018362998962, + "learning_rate": 0.0002, + "loss": 1.5038, + "step": 3800 + }, + { + "epoch": 3.3731739707835326, + "grad_norm": 0.5679979920387268, + "learning_rate": 0.0002, + "loss": 1.4918, + "step": 3810 + }, + { + "epoch": 3.3820274457724655, + "grad_norm": 0.5613794922828674, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3820 + }, + { + "epoch": 3.390880920761399, + "grad_norm": 0.5328839421272278, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 3830 + }, + { + "epoch": 3.399734395750332, + "grad_norm": 0.5960017442703247, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 3840 + }, + { + "epoch": 3.4085878707392654, + "grad_norm": 0.5264106392860413, + "learning_rate": 0.0002, + "loss": 1.4227, + "step": 3850 + }, + { + "epoch": 3.4174413457281982, + "grad_norm": 0.6378359198570251, + "learning_rate": 0.0002, + "loss": 1.4766, + "step": 3860 + }, + { + "epoch": 3.4262948207171315, + "grad_norm": 0.5792967677116394, + "learning_rate": 0.0002, + "loss": 1.4898, + "step": 3870 + }, + { + "epoch": 3.435148295706065, + "grad_norm": 0.6836280822753906, + "learning_rate": 0.0002, + "loss": 1.4914, + "step": 3880 + }, + { + "epoch": 3.4440017706949977, + "grad_norm": 0.6073971390724182, + "learning_rate": 0.0002, + "loss": 1.5002, + "step": 3890 + }, + { + "epoch": 3.452855245683931, + "grad_norm": 0.5753195881843567, + "learning_rate": 0.0002, + "loss": 1.4473, + "step": 3900 + }, + { + "epoch": 3.4617087206728643, + "grad_norm": 0.6007646918296814, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3910 + }, + { + "epoch": 3.470562195661797, + "grad_norm": 0.6025636196136475, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3920 + }, + { + "epoch": 3.4794156706507304, + "grad_norm": 0.6819562315940857, + "learning_rate": 0.0002, + "loss": 1.4612, + "step": 3930 + }, + { + "epoch": 3.4882691456396637, + "grad_norm": 0.6448395848274231, + "learning_rate": 0.0002, + "loss": 1.518, + "step": 3940 + }, + { + "epoch": 3.4971226206285966, + "grad_norm": 0.5712178945541382, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 3950 + }, + { + "epoch": 3.50597609561753, + "grad_norm": 0.6300532817840576, + "learning_rate": 0.0002, + "loss": 1.4757, + "step": 3960 + }, + { + "epoch": 3.514829570606463, + "grad_norm": 0.6120840907096863, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 3970 + }, + { + "epoch": 3.523683045595396, + "grad_norm": 0.6887575387954712, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 3980 + }, + { + "epoch": 3.5325365205843293, + "grad_norm": 0.6970235109329224, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 3990 + }, + { + "epoch": 3.5413899955732626, + "grad_norm": 0.5818213820457458, + "learning_rate": 0.0002, + "loss": 1.5198, + "step": 4000 + }, + { + "epoch": 3.5502434705621955, + "grad_norm": 1.0533310174942017, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 4010 + }, + { + "epoch": 3.5590969455511288, + "grad_norm": 0.5444280505180359, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4020 + }, + { + "epoch": 3.567950420540062, + "grad_norm": 0.6007506847381592, + "learning_rate": 0.0002, + "loss": 1.5573, + "step": 4030 + }, + { + "epoch": 3.576803895528995, + "grad_norm": 0.6088743805885315, + "learning_rate": 0.0002, + "loss": 1.5059, + "step": 4040 + }, + { + "epoch": 3.585657370517928, + "grad_norm": 0.5934239029884338, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 4050 + }, + { + "epoch": 3.5945108455068615, + "grad_norm": 0.605251669883728, + "learning_rate": 0.0002, + "loss": 1.4938, + "step": 4060 + }, + { + "epoch": 3.6033643204957944, + "grad_norm": 0.5903469920158386, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 4070 + }, + { + "epoch": 3.6122177954847277, + "grad_norm": 0.6752413511276245, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4080 + }, + { + "epoch": 3.621071270473661, + "grad_norm": 0.5810418725013733, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4090 + }, + { + "epoch": 3.629924745462594, + "grad_norm": 0.5918573141098022, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 4100 + }, + { + "epoch": 3.638778220451527, + "grad_norm": 0.6635358333587646, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 4110 + }, + { + "epoch": 3.6476316954404604, + "grad_norm": 0.5785038471221924, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4120 + }, + { + "epoch": 3.6564851704293937, + "grad_norm": 0.5837879776954651, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 4130 + }, + { + "epoch": 3.6653386454183265, + "grad_norm": 0.6449324488639832, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4140 + }, + { + "epoch": 3.67419212040726, + "grad_norm": 0.6191908717155457, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4150 + }, + { + "epoch": 3.683045595396193, + "grad_norm": 0.6937987208366394, + "learning_rate": 0.0002, + "loss": 1.4567, + "step": 4160 + }, + { + "epoch": 3.6918990703851264, + "grad_norm": 0.581128716468811, + "learning_rate": 0.0002, + "loss": 1.4136, + "step": 4170 + }, + { + "epoch": 3.7007525453740593, + "grad_norm": 0.6547803282737732, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 4180 + }, + { + "epoch": 3.7096060203629926, + "grad_norm": 0.5961150527000427, + "learning_rate": 0.0002, + "loss": 1.4653, + "step": 4190 + }, + { + "epoch": 3.718459495351926, + "grad_norm": 0.6197913885116577, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 4200 + }, + { + "epoch": 3.7273129703408587, + "grad_norm": 0.688565194606781, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 4210 + }, + { + "epoch": 3.736166445329792, + "grad_norm": 0.5832270979881287, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 4220 + }, + { + "epoch": 3.7450199203187253, + "grad_norm": 0.5643884539604187, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 4230 + }, + { + "epoch": 3.753873395307658, + "grad_norm": 0.6236484050750732, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4240 + }, + { + "epoch": 3.7627268702965915, + "grad_norm": 0.5367720127105713, + "learning_rate": 0.0002, + "loss": 1.576, + "step": 4250 + }, + { + "epoch": 3.7715803452855248, + "grad_norm": 0.5785109400749207, + "learning_rate": 0.0002, + "loss": 1.5234, + "step": 4260 + }, + { + "epoch": 3.7804338202744576, + "grad_norm": 0.5698465704917908, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4270 + }, + { + "epoch": 3.789287295263391, + "grad_norm": 0.5748036503791809, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4280 + }, + { + "epoch": 3.798140770252324, + "grad_norm": 0.608147382736206, + "learning_rate": 0.0002, + "loss": 1.5503, + "step": 4290 + }, + { + "epoch": 3.806994245241257, + "grad_norm": 0.5820456147193909, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 4300 + }, + { + "epoch": 3.8158477202301904, + "grad_norm": 0.6325612664222717, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4310 + }, + { + "epoch": 3.8247011952191237, + "grad_norm": 0.6465362310409546, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4320 + }, + { + "epoch": 3.8335546702080565, + "grad_norm": 0.5630854368209839, + "learning_rate": 0.0002, + "loss": 1.5048, + "step": 4330 + }, + { + "epoch": 3.84240814519699, + "grad_norm": 0.6181462407112122, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4340 + }, + { + "epoch": 3.851261620185923, + "grad_norm": 0.6207571029663086, + "learning_rate": 0.0002, + "loss": 1.5113, + "step": 4350 + }, + { + "epoch": 3.860115095174856, + "grad_norm": 0.6092919111251831, + "learning_rate": 0.0002, + "loss": 1.5424, + "step": 4360 + }, + { + "epoch": 3.8689685701637893, + "grad_norm": 0.6140493750572205, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 4370 + }, + { + "epoch": 3.8778220451527226, + "grad_norm": 0.611575722694397, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 4380 + }, + { + "epoch": 3.8866755201416554, + "grad_norm": 0.6288794279098511, + "learning_rate": 0.0002, + "loss": 1.5563, + "step": 4390 + }, + { + "epoch": 3.8955289951305887, + "grad_norm": 0.6518979072570801, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4400 + }, + { + "epoch": 3.904382470119522, + "grad_norm": 0.6144753098487854, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4410 + }, + { + "epoch": 3.913235945108455, + "grad_norm": 0.7034937143325806, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 4420 + }, + { + "epoch": 3.922089420097388, + "grad_norm": 0.5713187456130981, + "learning_rate": 0.0002, + "loss": 1.4978, + "step": 4430 + }, + { + "epoch": 3.9309428950863214, + "grad_norm": 0.6187576651573181, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 4440 + }, + { + "epoch": 3.9397963700752543, + "grad_norm": 0.6439383029937744, + "learning_rate": 0.0002, + "loss": 1.551, + "step": 4450 + }, + { + "epoch": 3.9486498450641876, + "grad_norm": 0.6133334636688232, + "learning_rate": 0.0002, + "loss": 1.5073, + "step": 4460 + }, + { + "epoch": 3.957503320053121, + "grad_norm": 0.593463659286499, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 4470 + }, + { + "epoch": 3.9663567950420537, + "grad_norm": 0.6261998414993286, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 4480 + }, + { + "epoch": 3.975210270030987, + "grad_norm": 0.6153767704963684, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 4490 + }, + { + "epoch": 3.9840637450199203, + "grad_norm": 0.6184002757072449, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 4500 + }, + { + "epoch": 3.9929172200088536, + "grad_norm": 0.5212734341621399, + "learning_rate": 0.0002, + "loss": 1.5134, + "step": 4510 + }, + { + "epoch": 4.0, + "eval_loss": 1.8745536804199219, + "eval_runtime": 83.0125, + "eval_samples_per_second": 6.204, + "eval_steps_per_second": 0.783, + "step": 4518 + }, + { + "epoch": 4.001770694997787, + "grad_norm": 0.5871603488922119, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 4520 + }, + { + "epoch": 4.01062416998672, + "grad_norm": 0.6746091842651367, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 4530 + }, + { + "epoch": 4.019477644975653, + "grad_norm": 0.6159639358520508, + "learning_rate": 0.0002, + "loss": 1.3625, + "step": 4540 + }, + { + "epoch": 4.028331119964586, + "grad_norm": 0.7529398202896118, + "learning_rate": 0.0002, + "loss": 1.3766, + "step": 4550 + }, + { + "epoch": 4.037184594953519, + "grad_norm": 0.788398027420044, + "learning_rate": 0.0002, + "loss": 1.3202, + "step": 4560 + }, + { + "epoch": 4.046038069942452, + "grad_norm": 0.9679850935935974, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 4570 + }, + { + "epoch": 4.054891544931386, + "grad_norm": 0.6305310130119324, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 4580 + }, + { + "epoch": 4.063745019920319, + "grad_norm": 0.8557451963424683, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 4590 + }, + { + "epoch": 4.0725984949092515, + "grad_norm": 0.741518497467041, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4600 + }, + { + "epoch": 4.081451969898185, + "grad_norm": 0.6573862433433533, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4610 + }, + { + "epoch": 4.090305444887118, + "grad_norm": 0.6926319599151611, + "learning_rate": 0.0002, + "loss": 1.3341, + "step": 4620 + }, + { + "epoch": 4.099158919876051, + "grad_norm": 0.9212626218795776, + "learning_rate": 0.0002, + "loss": 1.4176, + "step": 4630 + }, + { + "epoch": 4.108012394864985, + "grad_norm": 0.7167867422103882, + "learning_rate": 0.0002, + "loss": 1.3402, + "step": 4640 + }, + { + "epoch": 4.116865869853918, + "grad_norm": 0.6691595911979675, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4650 + }, + { + "epoch": 4.12571934484285, + "grad_norm": 0.8708247542381287, + "learning_rate": 0.0002, + "loss": 1.247, + "step": 4660 + }, + { + "epoch": 4.134572819831784, + "grad_norm": 0.8612170219421387, + "learning_rate": 0.0002, + "loss": 1.3599, + "step": 4670 + }, + { + "epoch": 4.143426294820717, + "grad_norm": 0.7688325047492981, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4680 + }, + { + "epoch": 4.152279769809651, + "grad_norm": 0.7606917023658752, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4690 + }, + { + "epoch": 4.161133244798584, + "grad_norm": 0.8241282105445862, + "learning_rate": 0.0002, + "loss": 1.3521, + "step": 4700 + }, + { + "epoch": 4.1699867197875164, + "grad_norm": 0.7480464577674866, + "learning_rate": 0.0002, + "loss": 1.3325, + "step": 4710 + }, + { + "epoch": 4.17884019477645, + "grad_norm": 0.7092460989952087, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4720 + }, + { + "epoch": 4.187693669765383, + "grad_norm": 0.8782108426094055, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 4730 + }, + { + "epoch": 4.196547144754316, + "grad_norm": 0.6875300407409668, + "learning_rate": 0.0002, + "loss": 1.3626, + "step": 4740 + }, + { + "epoch": 4.20540061974325, + "grad_norm": 0.7713887691497803, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 4750 + }, + { + "epoch": 4.2142540947321825, + "grad_norm": 0.8270819783210754, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 4760 + }, + { + "epoch": 4.223107569721115, + "grad_norm": 0.7109288573265076, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 4770 + }, + { + "epoch": 4.231961044710049, + "grad_norm": 0.7209359407424927, + "learning_rate": 0.0002, + "loss": 1.3948, + "step": 4780 + }, + { + "epoch": 4.240814519698982, + "grad_norm": 0.7142833471298218, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 4790 + }, + { + "epoch": 4.249667994687915, + "grad_norm": 0.8526809811592102, + "learning_rate": 0.0002, + "loss": 1.3654, + "step": 4800 + }, + { + "epoch": 4.2585214696768485, + "grad_norm": 0.7064695954322815, + "learning_rate": 0.0002, + "loss": 1.3819, + "step": 4810 + }, + { + "epoch": 4.267374944665781, + "grad_norm": 0.7646124362945557, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 4820 + }, + { + "epoch": 4.276228419654714, + "grad_norm": 0.7377115488052368, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 4830 + }, + { + "epoch": 4.285081894643648, + "grad_norm": 0.7308453321456909, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 4840 + }, + { + "epoch": 4.293935369632581, + "grad_norm": 0.6687684059143066, + "learning_rate": 0.0002, + "loss": 1.3653, + "step": 4850 + }, + { + "epoch": 4.302788844621514, + "grad_norm": 0.7447634339332581, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 4860 + }, + { + "epoch": 4.311642319610447, + "grad_norm": 0.7661601305007935, + "learning_rate": 0.0002, + "loss": 1.3842, + "step": 4870 + }, + { + "epoch": 4.32049579459938, + "grad_norm": 0.7492215037345886, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4880 + }, + { + "epoch": 4.329349269588313, + "grad_norm": 0.9554458856582642, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 4890 + }, + { + "epoch": 4.338202744577247, + "grad_norm": 0.7409822940826416, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 4900 + }, + { + "epoch": 4.34705621956618, + "grad_norm": 0.9848645329475403, + "learning_rate": 0.0002, + "loss": 1.2581, + "step": 4910 + }, + { + "epoch": 4.355909694555113, + "grad_norm": 0.803995668888092, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4920 + }, + { + "epoch": 4.364763169544046, + "grad_norm": 0.7480606436729431, + "learning_rate": 0.0002, + "loss": 1.3585, + "step": 4930 + }, + { + "epoch": 4.373616644532979, + "grad_norm": 0.7018141150474548, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 4940 + }, + { + "epoch": 4.382470119521912, + "grad_norm": 0.7684932351112366, + "learning_rate": 0.0002, + "loss": 1.4034, + "step": 4950 + }, + { + "epoch": 4.391323594510846, + "grad_norm": 0.7849185466766357, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 4960 + }, + { + "epoch": 4.400177069499779, + "grad_norm": 0.7858862280845642, + "learning_rate": 0.0002, + "loss": 1.3763, + "step": 4970 + }, + { + "epoch": 4.4090305444887115, + "grad_norm": 0.8270778059959412, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 4980 + }, + { + "epoch": 4.417884019477645, + "grad_norm": 0.8464101552963257, + "learning_rate": 0.0002, + "loss": 1.445, + "step": 4990 + }, + { + "epoch": 4.426737494466578, + "grad_norm": 0.85670405626297, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5000 + }, + { + "epoch": 4.435590969455511, + "grad_norm": 0.8656655550003052, + "learning_rate": 0.0002, + "loss": 1.4203, + "step": 5010 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7605292201042175, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 5020 + }, + { + "epoch": 4.4532979194333775, + "grad_norm": 0.7682471871376038, + "learning_rate": 0.0002, + "loss": 1.3803, + "step": 5030 + }, + { + "epoch": 4.46215139442231, + "grad_norm": 0.7209102511405945, + "learning_rate": 0.0002, + "loss": 1.3432, + "step": 5040 + }, + { + "epoch": 4.471004869411244, + "grad_norm": 0.8259989023208618, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 5050 + }, + { + "epoch": 4.479858344400177, + "grad_norm": 0.7342197895050049, + "learning_rate": 0.0002, + "loss": 1.3709, + "step": 5060 + }, + { + "epoch": 4.48871181938911, + "grad_norm": 0.7869040369987488, + "learning_rate": 0.0002, + "loss": 1.4196, + "step": 5070 + }, + { + "epoch": 4.4975652943780435, + "grad_norm": 0.7906143665313721, + "learning_rate": 0.0002, + "loss": 1.3734, + "step": 5080 + }, + { + "epoch": 4.506418769366976, + "grad_norm": 0.7336861491203308, + "learning_rate": 0.0002, + "loss": 1.3555, + "step": 5090 + }, + { + "epoch": 4.515272244355909, + "grad_norm": 0.8264166712760925, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5100 + }, + { + "epoch": 4.524125719344843, + "grad_norm": 0.8144693970680237, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 5110 + }, + { + "epoch": 4.532979194333776, + "grad_norm": 0.8257269263267517, + "learning_rate": 0.0002, + "loss": 1.3044, + "step": 5120 + }, + { + "epoch": 4.541832669322709, + "grad_norm": 0.8838174343109131, + "learning_rate": 0.0002, + "loss": 1.3501, + "step": 5130 + }, + { + "epoch": 4.550686144311642, + "grad_norm": 0.7081145644187927, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 5140 + }, + { + "epoch": 4.559539619300575, + "grad_norm": 0.7137823700904846, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 5150 + }, + { + "epoch": 4.568393094289509, + "grad_norm": 0.7890386581420898, + "learning_rate": 0.0002, + "loss": 1.3788, + "step": 5160 + }, + { + "epoch": 4.577246569278442, + "grad_norm": 0.6418015360832214, + "learning_rate": 0.0002, + "loss": 1.3368, + "step": 5170 + }, + { + "epoch": 4.586100044267375, + "grad_norm": 0.768373966217041, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 5180 + }, + { + "epoch": 4.5949535192563085, + "grad_norm": 0.6934067606925964, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5190 + }, + { + "epoch": 4.603806994245241, + "grad_norm": 0.9430719017982483, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 5200 + }, + { + "epoch": 4.612660469234174, + "grad_norm": 0.880264163017273, + "learning_rate": 0.0002, + "loss": 1.3981, + "step": 5210 + }, + { + "epoch": 4.621513944223108, + "grad_norm": 0.7584623098373413, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 5220 + }, + { + "epoch": 4.630367419212041, + "grad_norm": 0.7974506616592407, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5230 + }, + { + "epoch": 4.639220894200974, + "grad_norm": 0.8812133073806763, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 5240 + }, + { + "epoch": 4.648074369189907, + "grad_norm": 0.8968724012374878, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 5250 + }, + { + "epoch": 4.65692784417884, + "grad_norm": 0.7317764759063721, + "learning_rate": 0.0002, + "loss": 1.3327, + "step": 5260 + }, + { + "epoch": 4.665781319167773, + "grad_norm": 0.7415484189987183, + "learning_rate": 0.0002, + "loss": 1.4363, + "step": 5270 + }, + { + "epoch": 4.674634794156707, + "grad_norm": 0.7867009043693542, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 5280 + }, + { + "epoch": 4.68348826914564, + "grad_norm": 0.6895416378974915, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 5290 + }, + { + "epoch": 4.6923417441345725, + "grad_norm": 0.7324506640434265, + "learning_rate": 0.0002, + "loss": 1.3438, + "step": 5300 + }, + { + "epoch": 4.701195219123506, + "grad_norm": 0.7383193969726562, + "learning_rate": 0.0002, + "loss": 1.4072, + "step": 5310 + }, + { + "epoch": 4.710048694112439, + "grad_norm": 0.8254916071891785, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 5320 + }, + { + "epoch": 4.718902169101372, + "grad_norm": 0.8161033987998962, + "learning_rate": 0.0002, + "loss": 1.4317, + "step": 5330 + }, + { + "epoch": 4.727755644090306, + "grad_norm": 0.7664386034011841, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 5340 + }, + { + "epoch": 4.7366091190792385, + "grad_norm": 0.7465475797653198, + "learning_rate": 0.0002, + "loss": 1.4293, + "step": 5350 + }, + { + "epoch": 4.745462594068171, + "grad_norm": 0.7810078263282776, + "learning_rate": 0.0002, + "loss": 1.3435, + "step": 5360 + }, + { + "epoch": 4.754316069057105, + "grad_norm": 0.7428439855575562, + "learning_rate": 0.0002, + "loss": 1.4489, + "step": 5370 + }, + { + "epoch": 4.763169544046038, + "grad_norm": 0.9548320174217224, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 5380 + }, + { + "epoch": 4.772023019034972, + "grad_norm": 0.7959533333778381, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 5390 + }, + { + "epoch": 4.780876494023905, + "grad_norm": 0.747473418712616, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 5400 + }, + { + "epoch": 4.789729969012837, + "grad_norm": 0.7863122820854187, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5410 + }, + { + "epoch": 4.798583444001771, + "grad_norm": 0.7769626379013062, + "learning_rate": 0.0002, + "loss": 1.4166, + "step": 5420 + }, + { + "epoch": 4.807436918990704, + "grad_norm": 0.8551191091537476, + "learning_rate": 0.0002, + "loss": 1.4484, + "step": 5430 + }, + { + "epoch": 4.816290393979637, + "grad_norm": 0.8364850878715515, + "learning_rate": 0.0002, + "loss": 1.4314, + "step": 5440 + }, + { + "epoch": 4.825143868968571, + "grad_norm": 0.7458856701850891, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 5450 + }, + { + "epoch": 4.8339973439575035, + "grad_norm": 0.7558291554450989, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 5460 + }, + { + "epoch": 4.842850818946436, + "grad_norm": 0.8396534323692322, + "learning_rate": 0.0002, + "loss": 1.3343, + "step": 5470 + }, + { + "epoch": 4.85170429393537, + "grad_norm": 0.7790794968605042, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 5480 + }, + { + "epoch": 4.860557768924303, + "grad_norm": 0.8607641458511353, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5490 + }, + { + "epoch": 4.869411243913236, + "grad_norm": 0.828134298324585, + "learning_rate": 0.0002, + "loss": 1.4011, + "step": 5500 + }, + { + "epoch": 4.8782647189021695, + "grad_norm": 0.8783106803894043, + "learning_rate": 0.0002, + "loss": 1.4089, + "step": 5510 + }, + { + "epoch": 4.887118193891102, + "grad_norm": 0.7476183176040649, + "learning_rate": 0.0002, + "loss": 1.4565, + "step": 5520 + }, + { + "epoch": 4.895971668880035, + "grad_norm": 0.8023254871368408, + "learning_rate": 0.0002, + "loss": 1.3974, + "step": 5530 + }, + { + "epoch": 4.904825143868969, + "grad_norm": 0.8021706938743591, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 5540 + }, + { + "epoch": 4.913678618857902, + "grad_norm": 0.7873618602752686, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 5550 + }, + { + "epoch": 4.922532093846835, + "grad_norm": 0.7181428670883179, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5560 + }, + { + "epoch": 4.931385568835768, + "grad_norm": 0.7464273571968079, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 5570 + }, + { + "epoch": 4.940239043824701, + "grad_norm": 0.7433671355247498, + "learning_rate": 0.0002, + "loss": 1.3184, + "step": 5580 + }, + { + "epoch": 4.949092518813634, + "grad_norm": 0.7571114301681519, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5590 + }, + { + "epoch": 4.957945993802568, + "grad_norm": 0.7811630964279175, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5600 + }, + { + "epoch": 4.966799468791501, + "grad_norm": 0.7609148621559143, + "learning_rate": 0.0002, + "loss": 1.4288, + "step": 5610 + }, + { + "epoch": 4.9756529437804335, + "grad_norm": 0.7324382066726685, + "learning_rate": 0.0002, + "loss": 1.3786, + "step": 5620 + }, + { + "epoch": 4.984506418769367, + "grad_norm": 0.9249559640884399, + "learning_rate": 0.0002, + "loss": 1.4557, + "step": 5630 + }, + { + "epoch": 4.9933598937583, + "grad_norm": 0.7852522134780884, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 5640 + }, + { + "epoch": 4.999557326250553, + "eval_loss": 1.9384633302688599, + "eval_runtime": 82.6042, + "eval_samples_per_second": 6.235, + "eval_steps_per_second": 0.787, + "step": 5647 + }, + { + "epoch": 5.002213368747233, + "grad_norm": 0.8052749037742615, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5650 + }, + { + "epoch": 5.011066843736167, + "grad_norm": 1.380603551864624, + "learning_rate": 0.0002, + "loss": 1.1967, + "step": 5660 + }, + { + "epoch": 5.0199203187251, + "grad_norm": 0.9197829365730286, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 5670 + }, + { + "epoch": 5.028773793714032, + "grad_norm": 0.9338570833206177, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 5680 + }, + { + "epoch": 5.037627268702966, + "grad_norm": 1.0464060306549072, + "learning_rate": 0.0002, + "loss": 1.1866, + "step": 5690 + }, + { + "epoch": 5.046480743691899, + "grad_norm": 0.9055638909339905, + "learning_rate": 0.0002, + "loss": 1.2211, + "step": 5700 + }, + { + "epoch": 5.055334218680832, + "grad_norm": 0.9494627714157104, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 5710 + }, + { + "epoch": 5.064187693669766, + "grad_norm": 0.9680962562561035, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 5720 + }, + { + "epoch": 5.0730411686586985, + "grad_norm": 1.0254695415496826, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 5730 + }, + { + "epoch": 5.081894643647631, + "grad_norm": 0.9306758642196655, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 5740 + }, + { + "epoch": 5.090748118636565, + "grad_norm": 1.0620356798171997, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 5750 + }, + { + "epoch": 5.099601593625498, + "grad_norm": 1.0401700735092163, + "learning_rate": 0.0002, + "loss": 1.2628, + "step": 5760 + }, + { + "epoch": 5.108455068614431, + "grad_norm": 0.9916906952857971, + "learning_rate": 0.0002, + "loss": 1.1976, + "step": 5770 + }, + { + "epoch": 5.1173085436033645, + "grad_norm": 0.8387252688407898, + "learning_rate": 0.0002, + "loss": 1.2847, + "step": 5780 + }, + { + "epoch": 5.126162018592297, + "grad_norm": 0.9870850443840027, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 5790 + }, + { + "epoch": 5.13501549358123, + "grad_norm": 0.9204064011573792, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 5800 + }, + { + "epoch": 5.143868968570164, + "grad_norm": 0.9951931834220886, + "learning_rate": 0.0002, + "loss": 1.2266, + "step": 5810 + }, + { + "epoch": 5.152722443559097, + "grad_norm": 0.9745809435844421, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 5820 + }, + { + "epoch": 5.16157591854803, + "grad_norm": 0.9467785954475403, + "learning_rate": 0.0002, + "loss": 1.2549, + "step": 5830 + }, + { + "epoch": 5.170429393536963, + "grad_norm": 1.0451668500900269, + "learning_rate": 0.0002, + "loss": 1.2309, + "step": 5840 + }, + { + "epoch": 5.179282868525896, + "grad_norm": 0.9740142822265625, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 5850 + }, + { + "epoch": 5.18813634351483, + "grad_norm": 1.2158266305923462, + "learning_rate": 0.0002, + "loss": 1.2137, + "step": 5860 + }, + { + "epoch": 5.196989818503763, + "grad_norm": 1.0795036554336548, + "learning_rate": 0.0002, + "loss": 1.1631, + "step": 5870 + }, + { + "epoch": 5.205843293492696, + "grad_norm": 0.9578470587730408, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 5880 + }, + { + "epoch": 5.214696768481629, + "grad_norm": 0.8887509703636169, + "learning_rate": 0.0002, + "loss": 1.2183, + "step": 5890 + }, + { + "epoch": 5.223550243470562, + "grad_norm": 1.171006441116333, + "learning_rate": 0.0002, + "loss": 1.1991, + "step": 5900 + }, + { + "epoch": 5.232403718459495, + "grad_norm": 0.9016029834747314, + "learning_rate": 0.0002, + "loss": 1.1781, + "step": 5910 + }, + { + "epoch": 5.241257193448429, + "grad_norm": 1.173136830329895, + "learning_rate": 0.0002, + "loss": 1.2057, + "step": 5920 + }, + { + "epoch": 5.250110668437362, + "grad_norm": 0.8760318160057068, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 5930 + }, + { + "epoch": 5.258964143426295, + "grad_norm": 0.8998854160308838, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 5940 + }, + { + "epoch": 5.267817618415228, + "grad_norm": 1.017175316810608, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 5950 + }, + { + "epoch": 5.276671093404161, + "grad_norm": 0.8646609783172607, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 5960 + }, + { + "epoch": 5.285524568393094, + "grad_norm": 1.0030627250671387, + "learning_rate": 0.0002, + "loss": 1.2044, + "step": 5970 + }, + { + "epoch": 5.294378043382028, + "grad_norm": 0.975911557674408, + "learning_rate": 0.0002, + "loss": 1.2365, + "step": 5980 + }, + { + "epoch": 5.303231518370961, + "grad_norm": 0.9576130509376526, + "learning_rate": 0.0002, + "loss": 1.2307, + "step": 5990 + }, + { + "epoch": 5.3120849933598935, + "grad_norm": 0.9566167593002319, + "learning_rate": 0.0002, + "loss": 1.2681, + "step": 6000 + }, + { + "epoch": 5.320938468348827, + "grad_norm": 0.9200350642204285, + "learning_rate": 0.0002, + "loss": 1.2029, + "step": 6010 + }, + { + "epoch": 5.32979194333776, + "grad_norm": 1.0491118431091309, + "learning_rate": 0.0002, + "loss": 1.1871, + "step": 6020 + }, + { + "epoch": 5.338645418326693, + "grad_norm": 1.1199153661727905, + "learning_rate": 0.0002, + "loss": 1.2531, + "step": 6030 + }, + { + "epoch": 5.347498893315627, + "grad_norm": 1.015252947807312, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 6040 + }, + { + "epoch": 5.3563523683045595, + "grad_norm": 1.1076666116714478, + "learning_rate": 0.0002, + "loss": 1.2208, + "step": 6050 + }, + { + "epoch": 5.365205843293492, + "grad_norm": 0.9224653840065002, + "learning_rate": 0.0002, + "loss": 1.1953, + "step": 6060 + }, + { + "epoch": 5.374059318282426, + "grad_norm": 1.0079779624938965, + "learning_rate": 0.0002, + "loss": 1.2045, + "step": 6070 + }, + { + "epoch": 5.382912793271359, + "grad_norm": 0.9627894759178162, + "learning_rate": 0.0002, + "loss": 1.2612, + "step": 6080 + }, + { + "epoch": 5.391766268260292, + "grad_norm": 1.0503166913986206, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 6090 + }, + { + "epoch": 5.400619743249226, + "grad_norm": 0.912736713886261, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 6100 + }, + { + "epoch": 5.409473218238158, + "grad_norm": 1.2552032470703125, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 6110 + }, + { + "epoch": 5.418326693227091, + "grad_norm": 0.986230731010437, + "learning_rate": 0.0002, + "loss": 1.2738, + "step": 6120 + }, + { + "epoch": 5.427180168216025, + "grad_norm": 0.9869757294654846, + "learning_rate": 0.0002, + "loss": 1.3301, + "step": 6130 + }, + { + "epoch": 5.436033643204958, + "grad_norm": 1.012027621269226, + "learning_rate": 0.0002, + "loss": 1.241, + "step": 6140 + }, + { + "epoch": 5.444887118193891, + "grad_norm": 0.8855568170547485, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 6150 + }, + { + "epoch": 5.4537405931828244, + "grad_norm": 1.1522414684295654, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6160 + }, + { + "epoch": 5.462594068171757, + "grad_norm": 1.2448474168777466, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 6170 + }, + { + "epoch": 5.471447543160691, + "grad_norm": 1.0362223386764526, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 6180 + }, + { + "epoch": 5.480301018149624, + "grad_norm": 0.9363031983375549, + "learning_rate": 0.0002, + "loss": 1.2351, + "step": 6190 + }, + { + "epoch": 5.489154493138557, + "grad_norm": 0.8852020502090454, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 6200 + }, + { + "epoch": 5.4980079681274905, + "grad_norm": 0.8577062487602234, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 6210 + }, + { + "epoch": 5.506861443116423, + "grad_norm": 0.9351891875267029, + "learning_rate": 0.0002, + "loss": 1.2547, + "step": 6220 + }, + { + "epoch": 5.515714918105356, + "grad_norm": 1.0031992197036743, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 6230 + }, + { + "epoch": 5.52456839309429, + "grad_norm": 0.9935104250907898, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 6240 + }, + { + "epoch": 5.533421868083223, + "grad_norm": 1.1086243391036987, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 6250 + }, + { + "epoch": 5.542275343072156, + "grad_norm": 0.990772545337677, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 6260 + }, + { + "epoch": 5.551128818061089, + "grad_norm": 0.9317597150802612, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 6270 + }, + { + "epoch": 5.559982293050022, + "grad_norm": 0.9657552242279053, + "learning_rate": 0.0002, + "loss": 1.2095, + "step": 6280 + }, + { + "epoch": 5.568835768038955, + "grad_norm": 1.0271565914154053, + "learning_rate": 0.0002, + "loss": 1.2435, + "step": 6290 + }, + { + "epoch": 5.577689243027889, + "grad_norm": 0.916253924369812, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6300 + }, + { + "epoch": 5.586542718016822, + "grad_norm": 1.0083940029144287, + "learning_rate": 0.0002, + "loss": 1.2648, + "step": 6310 + }, + { + "epoch": 5.5953961930057545, + "grad_norm": 0.9740358591079712, + "learning_rate": 0.0002, + "loss": 1.2904, + "step": 6320 + }, + { + "epoch": 5.604249667994688, + "grad_norm": 0.9645405411720276, + "learning_rate": 0.0002, + "loss": 1.2507, + "step": 6330 + }, + { + "epoch": 5.613103142983621, + "grad_norm": 0.9677100777626038, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 6340 + }, + { + "epoch": 5.621956617972554, + "grad_norm": 0.9706602692604065, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 6350 + }, + { + "epoch": 5.630810092961488, + "grad_norm": 1.1492316722869873, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 6360 + }, + { + "epoch": 5.639663567950421, + "grad_norm": 0.8857277035713196, + "learning_rate": 0.0002, + "loss": 1.2242, + "step": 6370 + }, + { + "epoch": 5.648517042939353, + "grad_norm": 1.0363037586212158, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 6380 + }, + { + "epoch": 5.657370517928287, + "grad_norm": 0.9621800780296326, + "learning_rate": 0.0002, + "loss": 1.1838, + "step": 6390 + }, + { + "epoch": 5.66622399291722, + "grad_norm": 0.9937820434570312, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 6400 + }, + { + "epoch": 5.675077467906153, + "grad_norm": 0.9491283297538757, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6410 + }, + { + "epoch": 5.683930942895087, + "grad_norm": 0.9429448246955872, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6420 + }, + { + "epoch": 5.6927844178840195, + "grad_norm": 0.9808844327926636, + "learning_rate": 0.0002, + "loss": 1.1663, + "step": 6430 + }, + { + "epoch": 5.701637892872952, + "grad_norm": 0.8191056847572327, + "learning_rate": 0.0002, + "loss": 1.2574, + "step": 6440 + }, + { + "epoch": 5.710491367861886, + "grad_norm": 1.1118974685668945, + "learning_rate": 0.0002, + "loss": 1.2659, + "step": 6450 + }, + { + "epoch": 5.719344842850819, + "grad_norm": 0.9030969142913818, + "learning_rate": 0.0002, + "loss": 1.2192, + "step": 6460 + }, + { + "epoch": 5.728198317839752, + "grad_norm": 1.0509997606277466, + "learning_rate": 0.0002, + "loss": 1.301, + "step": 6470 + }, + { + "epoch": 5.7370517928286855, + "grad_norm": 1.0369981527328491, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 6480 + }, + { + "epoch": 5.745905267817618, + "grad_norm": 0.8626071214675903, + "learning_rate": 0.0002, + "loss": 1.2518, + "step": 6490 + }, + { + "epoch": 5.754758742806551, + "grad_norm": 1.0448849201202393, + "learning_rate": 0.0002, + "loss": 1.2446, + "step": 6500 + }, + { + "epoch": 5.763612217795485, + "grad_norm": 0.9333119988441467, + "learning_rate": 0.0002, + "loss": 1.2698, + "step": 6510 + }, + { + "epoch": 5.772465692784418, + "grad_norm": 0.8533532023429871, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 6520 + }, + { + "epoch": 5.781319167773351, + "grad_norm": 0.9774261713027954, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6530 + }, + { + "epoch": 5.790172642762284, + "grad_norm": 0.9841071963310242, + "learning_rate": 0.0002, + "loss": 1.2031, + "step": 6540 + }, + { + "epoch": 5.799026117751217, + "grad_norm": 0.9891805052757263, + "learning_rate": 0.0002, + "loss": 1.2767, + "step": 6550 + }, + { + "epoch": 5.80787959274015, + "grad_norm": 0.9633952379226685, + "learning_rate": 0.0002, + "loss": 1.3373, + "step": 6560 + }, + { + "epoch": 5.816733067729084, + "grad_norm": 1.327634334564209, + "learning_rate": 0.0002, + "loss": 1.1939, + "step": 6570 + }, + { + "epoch": 5.825586542718017, + "grad_norm": 0.9805197715759277, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 6580 + }, + { + "epoch": 5.8344400177069495, + "grad_norm": 1.020957589149475, + "learning_rate": 0.0002, + "loss": 1.1933, + "step": 6590 + }, + { + "epoch": 5.843293492695883, + "grad_norm": 0.9694032669067383, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6600 + }, + { + "epoch": 5.852146967684816, + "grad_norm": 0.8980914354324341, + "learning_rate": 0.0002, + "loss": 1.2671, + "step": 6610 + }, + { + "epoch": 5.861000442673749, + "grad_norm": 0.8312330842018127, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 6620 + }, + { + "epoch": 5.869853917662683, + "grad_norm": 0.9773725271224976, + "learning_rate": 0.0002, + "loss": 1.3301, + "step": 6630 + }, + { + "epoch": 5.878707392651616, + "grad_norm": 0.9684233665466309, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 6640 + }, + { + "epoch": 5.887560867640548, + "grad_norm": 0.8436519503593445, + "learning_rate": 0.0002, + "loss": 1.2866, + "step": 6650 + }, + { + "epoch": 5.896414342629482, + "grad_norm": 0.9129888415336609, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 6660 + }, + { + "epoch": 5.905267817618415, + "grad_norm": 0.8871369957923889, + "learning_rate": 0.0002, + "loss": 1.3272, + "step": 6670 + }, + { + "epoch": 5.914121292607349, + "grad_norm": 0.9544420838356018, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6680 + }, + { + "epoch": 5.922974767596282, + "grad_norm": 0.9607448577880859, + "learning_rate": 0.0002, + "loss": 1.2954, + "step": 6690 + }, + { + "epoch": 5.9318282425852145, + "grad_norm": 0.9675708413124084, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 6700 + }, + { + "epoch": 5.940681717574148, + "grad_norm": 0.9373534321784973, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 6710 + }, + { + "epoch": 5.949535192563081, + "grad_norm": 0.9750351905822754, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 6720 + }, + { + "epoch": 5.958388667552014, + "grad_norm": 0.9122727513313293, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 6730 + }, + { + "epoch": 5.967242142540948, + "grad_norm": 0.9300726652145386, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 6740 + }, + { + "epoch": 5.9760956175298805, + "grad_norm": 0.972944438457489, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6750 + }, + { + "epoch": 5.984949092518813, + "grad_norm": 1.2385832071304321, + "learning_rate": 0.0002, + "loss": 1.3252, + "step": 6760 + }, + { + "epoch": 5.993802567507747, + "grad_norm": 0.9080338478088379, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6770 + }, + { + "epoch": 6.0, + "eval_loss": 2.062002658843994, + "eval_runtime": 83.2814, + "eval_samples_per_second": 6.184, + "eval_steps_per_second": 0.78, + "step": 6777 + }, + { + "epoch": 6.00265604249668, + "grad_norm": 0.8741096258163452, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 6780 + }, + { + "epoch": 6.011509517485613, + "grad_norm": 1.2510347366333008, + "learning_rate": 0.0002, + "loss": 1.1242, + "step": 6790 + }, + { + "epoch": 6.0203629924745465, + "grad_norm": 1.063910722732544, + "learning_rate": 0.0002, + "loss": 1.0269, + "step": 6800 + }, + { + "epoch": 6.029216467463479, + "grad_norm": 1.169573187828064, + "learning_rate": 0.0002, + "loss": 1.0468, + "step": 6810 + }, + { + "epoch": 6.038069942452412, + "grad_norm": 1.0453242063522339, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 6820 + }, + { + "epoch": 6.046923417441346, + "grad_norm": 1.1960445642471313, + "learning_rate": 0.0002, + "loss": 1.0469, + "step": 6830 + }, + { + "epoch": 6.055776892430279, + "grad_norm": 0.9427650570869446, + "learning_rate": 0.0002, + "loss": 1.1233, + "step": 6840 + }, + { + "epoch": 6.064630367419212, + "grad_norm": 1.2107350826263428, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 6850 + }, + { + "epoch": 6.073483842408145, + "grad_norm": 1.262130856513977, + "learning_rate": 0.0002, + "loss": 1.0751, + "step": 6860 + }, + { + "epoch": 6.082337317397078, + "grad_norm": 1.1628082990646362, + "learning_rate": 0.0002, + "loss": 1.0787, + "step": 6870 + }, + { + "epoch": 6.091190792386011, + "grad_norm": 1.0090514421463013, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 6880 + }, + { + "epoch": 6.100044267374945, + "grad_norm": 1.5029802322387695, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 6890 + }, + { + "epoch": 6.108897742363878, + "grad_norm": 1.0522133111953735, + "learning_rate": 0.0002, + "loss": 1.0549, + "step": 6900 + }, + { + "epoch": 6.117751217352811, + "grad_norm": 1.225534439086914, + "learning_rate": 0.0002, + "loss": 1.0502, + "step": 6910 + }, + { + "epoch": 6.126604692341744, + "grad_norm": 1.2859058380126953, + "learning_rate": 0.0002, + "loss": 1.0808, + "step": 6920 + }, + { + "epoch": 6.135458167330677, + "grad_norm": 1.215205192565918, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 6930 + }, + { + "epoch": 6.14431164231961, + "grad_norm": 1.1799274682998657, + "learning_rate": 0.0002, + "loss": 1.1442, + "step": 6940 + }, + { + "epoch": 6.153165117308544, + "grad_norm": 1.2553550004959106, + "learning_rate": 0.0002, + "loss": 1.0749, + "step": 6950 + }, + { + "epoch": 6.162018592297477, + "grad_norm": 1.2171931266784668, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 6960 + }, + { + "epoch": 6.17087206728641, + "grad_norm": 1.1896923780441284, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 6970 + }, + { + "epoch": 6.179725542275343, + "grad_norm": 1.007250189781189, + "learning_rate": 0.0002, + "loss": 1.1477, + "step": 6980 + }, + { + "epoch": 6.188579017264276, + "grad_norm": 1.2109580039978027, + "learning_rate": 0.0002, + "loss": 1.1551, + "step": 6990 + }, + { + "epoch": 6.19743249225321, + "grad_norm": 1.2197009325027466, + "learning_rate": 0.0002, + "loss": 1.0809, + "step": 7000 + }, + { + "epoch": 6.206285967242143, + "grad_norm": 1.1417629718780518, + "learning_rate": 0.0002, + "loss": 1.1322, + "step": 7010 + }, + { + "epoch": 6.2151394422310755, + "grad_norm": 1.2337356805801392, + "learning_rate": 0.0002, + "loss": 1.0541, + "step": 7020 + }, + { + "epoch": 6.223992917220009, + "grad_norm": 1.1230454444885254, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 7030 + }, + { + "epoch": 6.232846392208942, + "grad_norm": 1.0634387731552124, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 7040 + }, + { + "epoch": 6.241699867197875, + "grad_norm": 1.1566855907440186, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 7050 + }, + { + "epoch": 6.250553342186809, + "grad_norm": 1.2251075506210327, + "learning_rate": 0.0002, + "loss": 1.063, + "step": 7060 + }, + { + "epoch": 6.2594068171757415, + "grad_norm": 1.2232472896575928, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 7070 + }, + { + "epoch": 6.268260292164674, + "grad_norm": 1.1014091968536377, + "learning_rate": 0.0002, + "loss": 1.0394, + "step": 7080 + }, + { + "epoch": 6.277113767153608, + "grad_norm": 1.322811245918274, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7090 + }, + { + "epoch": 6.285967242142541, + "grad_norm": 0.9820072650909424, + "learning_rate": 0.0002, + "loss": 1.1108, + "step": 7100 + }, + { + "epoch": 6.294820717131474, + "grad_norm": 1.13047456741333, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 7110 + }, + { + "epoch": 6.303674192120408, + "grad_norm": 1.145127534866333, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 7120 + }, + { + "epoch": 6.31252766710934, + "grad_norm": 1.101465106010437, + "learning_rate": 0.0002, + "loss": 1.089, + "step": 7130 + }, + { + "epoch": 6.321381142098273, + "grad_norm": 1.131705641746521, + "learning_rate": 0.0002, + "loss": 1.1122, + "step": 7140 + }, + { + "epoch": 6.330234617087207, + "grad_norm": 0.9876824617385864, + "learning_rate": 0.0002, + "loss": 1.0173, + "step": 7150 + }, + { + "epoch": 6.33908809207614, + "grad_norm": 1.2950096130371094, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 7160 + }, + { + "epoch": 6.347941567065073, + "grad_norm": 1.0496132373809814, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 7170 + }, + { + "epoch": 6.3567950420540065, + "grad_norm": 1.3835711479187012, + "learning_rate": 0.0002, + "loss": 1.1334, + "step": 7180 + }, + { + "epoch": 6.365648517042939, + "grad_norm": 1.176424503326416, + "learning_rate": 0.0002, + "loss": 0.9777, + "step": 7190 + }, + { + "epoch": 6.374501992031872, + "grad_norm": 1.3502846956253052, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 7200 + }, + { + "epoch": 6.383355467020806, + "grad_norm": 1.2429769039154053, + "learning_rate": 0.0002, + "loss": 1.0614, + "step": 7210 + }, + { + "epoch": 6.392208942009739, + "grad_norm": 1.138015866279602, + "learning_rate": 0.0002, + "loss": 1.1712, + "step": 7220 + }, + { + "epoch": 6.401062416998672, + "grad_norm": 1.4407539367675781, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 7230 + }, + { + "epoch": 6.409915891987605, + "grad_norm": 1.1464104652404785, + "learning_rate": 0.0002, + "loss": 1.1595, + "step": 7240 + }, + { + "epoch": 6.418769366976538, + "grad_norm": 1.2028888463974, + "learning_rate": 0.0002, + "loss": 1.1381, + "step": 7250 + }, + { + "epoch": 6.427622841965471, + "grad_norm": 1.132938027381897, + "learning_rate": 0.0002, + "loss": 1.1129, + "step": 7260 + }, + { + "epoch": 6.436476316954405, + "grad_norm": 1.2005301713943481, + "learning_rate": 0.0002, + "loss": 1.0662, + "step": 7270 + }, + { + "epoch": 6.445329791943338, + "grad_norm": 1.0460501909255981, + "learning_rate": 0.0002, + "loss": 1.0538, + "step": 7280 + }, + { + "epoch": 6.4541832669322705, + "grad_norm": 1.1363240480422974, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 7290 + }, + { + "epoch": 6.463036741921204, + "grad_norm": 1.0439460277557373, + "learning_rate": 0.0002, + "loss": 1.1042, + "step": 7300 + }, + { + "epoch": 6.471890216910137, + "grad_norm": 1.1968905925750732, + "learning_rate": 0.0002, + "loss": 1.0896, + "step": 7310 + }, + { + "epoch": 6.48074369189907, + "grad_norm": 1.0443525314331055, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 7320 + }, + { + "epoch": 6.489597166888004, + "grad_norm": 1.2550246715545654, + "learning_rate": 0.0002, + "loss": 1.1384, + "step": 7330 + }, + { + "epoch": 6.4984506418769366, + "grad_norm": 1.2880409955978394, + "learning_rate": 0.0002, + "loss": 1.2028, + "step": 7340 + }, + { + "epoch": 6.507304116865869, + "grad_norm": 1.2390265464782715, + "learning_rate": 0.0002, + "loss": 1.1173, + "step": 7350 + }, + { + "epoch": 6.516157591854803, + "grad_norm": 1.0650159120559692, + "learning_rate": 0.0002, + "loss": 1.065, + "step": 7360 + }, + { + "epoch": 6.525011066843736, + "grad_norm": 1.4934154748916626, + "learning_rate": 0.0002, + "loss": 1.1072, + "step": 7370 + }, + { + "epoch": 6.533864541832669, + "grad_norm": 1.0902682542800903, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 7380 + }, + { + "epoch": 6.542718016821603, + "grad_norm": 1.1561789512634277, + "learning_rate": 0.0002, + "loss": 1.145, + "step": 7390 + }, + { + "epoch": 6.551571491810535, + "grad_norm": 1.1010485887527466, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 7400 + }, + { + "epoch": 6.560424966799468, + "grad_norm": 1.1616493463516235, + "learning_rate": 0.0002, + "loss": 1.1063, + "step": 7410 + }, + { + "epoch": 6.569278441788402, + "grad_norm": 1.2321627140045166, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 7420 + }, + { + "epoch": 6.578131916777335, + "grad_norm": 1.162299394607544, + "learning_rate": 0.0002, + "loss": 1.135, + "step": 7430 + }, + { + "epoch": 6.586985391766269, + "grad_norm": 0.9935213923454285, + "learning_rate": 0.0002, + "loss": 1.1785, + "step": 7440 + }, + { + "epoch": 6.5958388667552015, + "grad_norm": 1.3035451173782349, + "learning_rate": 0.0002, + "loss": 1.078, + "step": 7450 + }, + { + "epoch": 6.604692341744134, + "grad_norm": 1.0957173109054565, + "learning_rate": 0.0002, + "loss": 1.1377, + "step": 7460 + }, + { + "epoch": 6.613545816733068, + "grad_norm": 1.166472315788269, + "learning_rate": 0.0002, + "loss": 1.1882, + "step": 7470 + }, + { + "epoch": 6.622399291722001, + "grad_norm": 1.332716464996338, + "learning_rate": 0.0002, + "loss": 1.1379, + "step": 7480 + }, + { + "epoch": 6.631252766710934, + "grad_norm": 1.1008102893829346, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 7490 + }, + { + "epoch": 6.6401062416998675, + "grad_norm": 1.4472310543060303, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 7500 + }, + { + "epoch": 6.6489597166888, + "grad_norm": 1.1247508525848389, + "learning_rate": 0.0002, + "loss": 1.1729, + "step": 7510 + }, + { + "epoch": 6.657813191677733, + "grad_norm": 1.297936201095581, + "learning_rate": 0.0002, + "loss": 1.1649, + "step": 7520 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0784718990325928, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 7530 + }, + { + "epoch": 6.6755201416556, + "grad_norm": 1.1518864631652832, + "learning_rate": 0.0002, + "loss": 1.0852, + "step": 7540 + }, + { + "epoch": 6.684373616644533, + "grad_norm": 1.1135684251785278, + "learning_rate": 0.0002, + "loss": 1.1611, + "step": 7550 + }, + { + "epoch": 6.693227091633466, + "grad_norm": 1.0792579650878906, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 7560 + }, + { + "epoch": 6.702080566622399, + "grad_norm": 1.1826539039611816, + "learning_rate": 0.0002, + "loss": 1.1466, + "step": 7570 + }, + { + "epoch": 6.710934041611332, + "grad_norm": 1.1485552787780762, + "learning_rate": 0.0002, + "loss": 1.0874, + "step": 7580 + }, + { + "epoch": 6.719787516600266, + "grad_norm": 1.090723991394043, + "learning_rate": 0.0002, + "loss": 1.0502, + "step": 7590 + }, + { + "epoch": 6.728640991589199, + "grad_norm": 1.105883002281189, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7600 + }, + { + "epoch": 6.737494466578132, + "grad_norm": 1.3093862533569336, + "learning_rate": 0.0002, + "loss": 1.1101, + "step": 7610 + }, + { + "epoch": 6.746347941567065, + "grad_norm": 1.0273808240890503, + "learning_rate": 0.0002, + "loss": 1.1202, + "step": 7620 + }, + { + "epoch": 6.755201416555998, + "grad_norm": 1.3253363370895386, + "learning_rate": 0.0002, + "loss": 1.2071, + "step": 7630 + }, + { + "epoch": 6.764054891544931, + "grad_norm": 1.1979365348815918, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 7640 + }, + { + "epoch": 6.772908366533865, + "grad_norm": 1.123506784439087, + "learning_rate": 0.0002, + "loss": 1.1208, + "step": 7650 + }, + { + "epoch": 6.781761841522798, + "grad_norm": 1.3928422927856445, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 7660 + }, + { + "epoch": 6.790615316511731, + "grad_norm": 1.1540825366973877, + "learning_rate": 0.0002, + "loss": 1.1535, + "step": 7670 + }, + { + "epoch": 6.799468791500664, + "grad_norm": 1.0836732387542725, + "learning_rate": 0.0002, + "loss": 1.1053, + "step": 7680 + }, + { + "epoch": 6.808322266489597, + "grad_norm": 1.0360240936279297, + "learning_rate": 0.0002, + "loss": 1.1049, + "step": 7690 + }, + { + "epoch": 6.817175741478531, + "grad_norm": 1.2440129518508911, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 7700 + }, + { + "epoch": 6.826029216467464, + "grad_norm": 1.1702594757080078, + "learning_rate": 0.0002, + "loss": 1.1245, + "step": 7710 + }, + { + "epoch": 6.8348826914563965, + "grad_norm": 1.0726280212402344, + "learning_rate": 0.0002, + "loss": 1.1021, + "step": 7720 + }, + { + "epoch": 6.84373616644533, + "grad_norm": 0.9410907030105591, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 7730 + }, + { + "epoch": 6.852589641434263, + "grad_norm": 1.042914867401123, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 7740 + }, + { + "epoch": 6.861443116423196, + "grad_norm": 1.1028170585632324, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 7750 + }, + { + "epoch": 6.87029659141213, + "grad_norm": 1.0990355014801025, + "learning_rate": 0.0002, + "loss": 1.0759, + "step": 7760 + }, + { + "epoch": 6.8791500664010625, + "grad_norm": 1.2572479248046875, + "learning_rate": 0.0002, + "loss": 1.1508, + "step": 7770 + }, + { + "epoch": 6.888003541389995, + "grad_norm": 1.250198483467102, + "learning_rate": 0.0002, + "loss": 1.1749, + "step": 7780 + }, + { + "epoch": 6.896857016378929, + "grad_norm": 1.1872532367706299, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 7790 + }, + { + "epoch": 6.905710491367862, + "grad_norm": 1.5275602340698242, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 7800 + }, + { + "epoch": 6.914563966356795, + "grad_norm": 1.015166163444519, + "learning_rate": 0.0002, + "loss": 1.0712, + "step": 7810 + }, + { + "epoch": 6.923417441345729, + "grad_norm": 1.3205344676971436, + "learning_rate": 0.0002, + "loss": 1.1931, + "step": 7820 + }, + { + "epoch": 6.932270916334661, + "grad_norm": 1.1329596042633057, + "learning_rate": 0.0002, + "loss": 1.222, + "step": 7830 + }, + { + "epoch": 6.941124391323594, + "grad_norm": 1.1614333391189575, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 7840 + }, + { + "epoch": 6.949977866312528, + "grad_norm": 1.3472208976745605, + "learning_rate": 0.0002, + "loss": 1.2127, + "step": 7850 + }, + { + "epoch": 6.958831341301461, + "grad_norm": 1.1490193605422974, + "learning_rate": 0.0002, + "loss": 1.1002, + "step": 7860 + }, + { + "epoch": 6.967684816290394, + "grad_norm": 1.1343097686767578, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 7870 + }, + { + "epoch": 6.9765382912793275, + "grad_norm": 1.2555341720581055, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 7880 + }, + { + "epoch": 6.98539176626826, + "grad_norm": 1.2695735692977905, + "learning_rate": 0.0002, + "loss": 1.0955, + "step": 7890 + }, + { + "epoch": 6.994245241257193, + "grad_norm": 1.1662464141845703, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 7900 + }, + { + "epoch": 6.999557326250553, + "eval_loss": 2.148611068725586, + "eval_runtime": 82.53, + "eval_samples_per_second": 6.24, + "eval_steps_per_second": 0.788, + "step": 7906 + }, + { + "epoch": 7.003098716246127, + "grad_norm": 1.0013059377670288, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 7910 + }, + { + "epoch": 7.01195219123506, + "grad_norm": 1.317168951034546, + "learning_rate": 0.0002, + "loss": 0.962, + "step": 7920 + }, + { + "epoch": 7.020805666223993, + "grad_norm": 1.2173038721084595, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 7930 + }, + { + "epoch": 7.029659141212926, + "grad_norm": 1.5555535554885864, + "learning_rate": 0.0002, + "loss": 0.9371, + "step": 7940 + }, + { + "epoch": 7.038512616201859, + "grad_norm": 1.1929986476898193, + "learning_rate": 0.0002, + "loss": 0.8791, + "step": 7950 + }, + { + "epoch": 7.047366091190792, + "grad_norm": 1.3552240133285522, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 7960 + }, + { + "epoch": 7.056219566179726, + "grad_norm": 1.3692620992660522, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 7970 + }, + { + "epoch": 7.065073041168659, + "grad_norm": 1.4173164367675781, + "learning_rate": 0.0002, + "loss": 0.9342, + "step": 7980 + }, + { + "epoch": 7.0739265161575915, + "grad_norm": 1.2271063327789307, + "learning_rate": 0.0002, + "loss": 0.8799, + "step": 7990 + }, + { + "epoch": 7.082779991146525, + "grad_norm": 1.4002584218978882, + "learning_rate": 0.0002, + "loss": 0.9586, + "step": 8000 + }, + { + "epoch": 7.091633466135458, + "grad_norm": 1.345386028289795, + "learning_rate": 0.0002, + "loss": 0.9682, + "step": 8010 + }, + { + "epoch": 7.100486941124391, + "grad_norm": 1.3328183889389038, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 8020 + }, + { + "epoch": 7.109340416113325, + "grad_norm": 1.1148749589920044, + "learning_rate": 0.0002, + "loss": 0.9408, + "step": 8030 + }, + { + "epoch": 7.1181938911022575, + "grad_norm": 1.316633939743042, + "learning_rate": 0.0002, + "loss": 0.8894, + "step": 8040 + }, + { + "epoch": 7.12704736609119, + "grad_norm": 1.2247374057769775, + "learning_rate": 0.0002, + "loss": 0.9547, + "step": 8050 + }, + { + "epoch": 7.135900841080124, + "grad_norm": 1.3124101161956787, + "learning_rate": 0.0002, + "loss": 0.9495, + "step": 8060 + }, + { + "epoch": 7.144754316069057, + "grad_norm": 1.3420861959457397, + "learning_rate": 0.0002, + "loss": 0.9922, + "step": 8070 + }, + { + "epoch": 7.15360779105799, + "grad_norm": 1.2799710035324097, + "learning_rate": 0.0002, + "loss": 0.9626, + "step": 8080 + }, + { + "epoch": 7.162461266046924, + "grad_norm": 1.3490463495254517, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 8090 + }, + { + "epoch": 7.171314741035856, + "grad_norm": 1.444670557975769, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 8100 + }, + { + "epoch": 7.180168216024789, + "grad_norm": 1.2264536619186401, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 8110 + }, + { + "epoch": 7.189021691013723, + "grad_norm": 1.2793710231781006, + "learning_rate": 0.0002, + "loss": 1.0122, + "step": 8120 + }, + { + "epoch": 7.197875166002656, + "grad_norm": 1.3160685300827026, + "learning_rate": 0.0002, + "loss": 0.9325, + "step": 8130 + }, + { + "epoch": 7.20672864099159, + "grad_norm": 1.289884090423584, + "learning_rate": 0.0002, + "loss": 1.0383, + "step": 8140 + }, + { + "epoch": 7.2155821159805225, + "grad_norm": 1.6820887327194214, + "learning_rate": 0.0002, + "loss": 0.9422, + "step": 8150 + }, + { + "epoch": 7.224435590969455, + "grad_norm": 1.403016209602356, + "learning_rate": 0.0002, + "loss": 0.9301, + "step": 8160 + }, + { + "epoch": 7.233289065958389, + "grad_norm": 1.3833755254745483, + "learning_rate": 0.0002, + "loss": 0.9361, + "step": 8170 + }, + { + "epoch": 7.242142540947322, + "grad_norm": 1.547101616859436, + "learning_rate": 0.0002, + "loss": 0.9408, + "step": 8180 + }, + { + "epoch": 7.250996015936255, + "grad_norm": 1.3376225233078003, + "learning_rate": 0.0002, + "loss": 0.9192, + "step": 8190 + }, + { + "epoch": 7.2598494909251885, + "grad_norm": 1.3008460998535156, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 8200 + }, + { + "epoch": 7.268702965914121, + "grad_norm": 1.3364465236663818, + "learning_rate": 0.0002, + "loss": 0.98, + "step": 8210 + }, + { + "epoch": 7.277556440903054, + "grad_norm": 1.3967384099960327, + "learning_rate": 0.0002, + "loss": 0.934, + "step": 8220 + }, + { + "epoch": 7.286409915891988, + "grad_norm": 1.538851022720337, + "learning_rate": 0.0002, + "loss": 0.9587, + "step": 8230 + }, + { + "epoch": 7.295263390880921, + "grad_norm": 1.6243304014205933, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 8240 + }, + { + "epoch": 7.304116865869854, + "grad_norm": 1.6250357627868652, + "learning_rate": 0.0002, + "loss": 0.9748, + "step": 8250 + }, + { + "epoch": 7.312970340858787, + "grad_norm": 1.361752986907959, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 8260 + }, + { + "epoch": 7.32182381584772, + "grad_norm": 1.4158686399459839, + "learning_rate": 0.0002, + "loss": 0.883, + "step": 8270 + }, + { + "epoch": 7.330677290836653, + "grad_norm": 1.4000667333602905, + "learning_rate": 0.0002, + "loss": 0.9915, + "step": 8280 + }, + { + "epoch": 7.339530765825587, + "grad_norm": 1.293979287147522, + "learning_rate": 0.0002, + "loss": 0.9323, + "step": 8290 + }, + { + "epoch": 7.34838424081452, + "grad_norm": 1.3639771938323975, + "learning_rate": 0.0002, + "loss": 0.9544, + "step": 8300 + }, + { + "epoch": 7.3572377158034525, + "grad_norm": 1.426788091659546, + "learning_rate": 0.0002, + "loss": 0.9925, + "step": 8310 + }, + { + "epoch": 7.366091190792386, + "grad_norm": 1.3375388383865356, + "learning_rate": 0.0002, + "loss": 0.9162, + "step": 8320 + }, + { + "epoch": 7.374944665781319, + "grad_norm": 1.2612264156341553, + "learning_rate": 0.0002, + "loss": 1.032, + "step": 8330 + }, + { + "epoch": 7.383798140770252, + "grad_norm": 1.431223750114441, + "learning_rate": 0.0002, + "loss": 0.9185, + "step": 8340 + }, + { + "epoch": 7.392651615759186, + "grad_norm": 1.4454351663589478, + "learning_rate": 0.0002, + "loss": 1.0072, + "step": 8350 + }, + { + "epoch": 7.401505090748119, + "grad_norm": 1.3863321542739868, + "learning_rate": 0.0002, + "loss": 1.0231, + "step": 8360 + }, + { + "epoch": 7.410358565737051, + "grad_norm": 1.2186199426651, + "learning_rate": 0.0002, + "loss": 1.0194, + "step": 8370 + }, + { + "epoch": 7.419212040725985, + "grad_norm": 1.338301181793213, + "learning_rate": 0.0002, + "loss": 1.0192, + "step": 8380 + }, + { + "epoch": 7.428065515714918, + "grad_norm": 1.4814497232437134, + "learning_rate": 0.0002, + "loss": 0.999, + "step": 8390 + }, + { + "epoch": 7.436918990703851, + "grad_norm": 1.430943489074707, + "learning_rate": 0.0002, + "loss": 0.9766, + "step": 8400 + }, + { + "epoch": 7.445772465692785, + "grad_norm": 1.215942621231079, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 8410 + }, + { + "epoch": 7.4546259406817175, + "grad_norm": 1.381890892982483, + "learning_rate": 0.0002, + "loss": 0.9708, + "step": 8420 + }, + { + "epoch": 7.46347941567065, + "grad_norm": 1.390587568283081, + "learning_rate": 0.0002, + "loss": 0.9153, + "step": 8430 + }, + { + "epoch": 7.472332890659584, + "grad_norm": 1.6421098709106445, + "learning_rate": 0.0002, + "loss": 0.9696, + "step": 8440 + }, + { + "epoch": 7.481186365648517, + "grad_norm": 1.43213951587677, + "learning_rate": 0.0002, + "loss": 1.032, + "step": 8450 + }, + { + "epoch": 7.490039840637451, + "grad_norm": 1.3095251321792603, + "learning_rate": 0.0002, + "loss": 0.9545, + "step": 8460 + }, + { + "epoch": 7.4988933156263835, + "grad_norm": 1.4996658563613892, + "learning_rate": 0.0002, + "loss": 1.0101, + "step": 8470 + }, + { + "epoch": 7.507746790615316, + "grad_norm": 1.2955113649368286, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 8480 + }, + { + "epoch": 7.51660026560425, + "grad_norm": 1.3235514163970947, + "learning_rate": 0.0002, + "loss": 1.0388, + "step": 8490 + }, + { + "epoch": 7.525453740593183, + "grad_norm": 1.408852219581604, + "learning_rate": 0.0002, + "loss": 1.014, + "step": 8500 + }, + { + "epoch": 7.534307215582116, + "grad_norm": 1.4187248945236206, + "learning_rate": 0.0002, + "loss": 0.9258, + "step": 8510 + }, + { + "epoch": 7.5431606905710495, + "grad_norm": 1.2473978996276855, + "learning_rate": 0.0002, + "loss": 0.9565, + "step": 8520 + }, + { + "epoch": 7.552014165559982, + "grad_norm": 1.2394654750823975, + "learning_rate": 0.0002, + "loss": 1.001, + "step": 8530 + }, + { + "epoch": 7.560867640548915, + "grad_norm": 1.383175253868103, + "learning_rate": 0.0002, + "loss": 0.9701, + "step": 8540 + }, + { + "epoch": 7.569721115537849, + "grad_norm": 1.4113128185272217, + "learning_rate": 0.0002, + "loss": 0.9309, + "step": 8550 + }, + { + "epoch": 7.578574590526782, + "grad_norm": 1.4652873277664185, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 8560 + }, + { + "epoch": 7.587428065515715, + "grad_norm": 1.3373491764068604, + "learning_rate": 0.0002, + "loss": 1.0479, + "step": 8570 + }, + { + "epoch": 7.596281540504648, + "grad_norm": 1.2278908491134644, + "learning_rate": 0.0002, + "loss": 0.9994, + "step": 8580 + }, + { + "epoch": 7.605135015493581, + "grad_norm": 1.3615998029708862, + "learning_rate": 0.0002, + "loss": 0.9934, + "step": 8590 + }, + { + "epoch": 7.613988490482514, + "grad_norm": 1.5927653312683105, + "learning_rate": 0.0002, + "loss": 1.0324, + "step": 8600 + }, + { + "epoch": 7.622841965471448, + "grad_norm": 1.4127552509307861, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 8610 + }, + { + "epoch": 7.631695440460381, + "grad_norm": 1.276419997215271, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 8620 + }, + { + "epoch": 7.640548915449314, + "grad_norm": 1.3077269792556763, + "learning_rate": 0.0002, + "loss": 1.0321, + "step": 8630 + }, + { + "epoch": 7.649402390438247, + "grad_norm": 1.449960470199585, + "learning_rate": 0.0002, + "loss": 1.0571, + "step": 8640 + }, + { + "epoch": 7.65825586542718, + "grad_norm": 1.4388965368270874, + "learning_rate": 0.0002, + "loss": 1.0317, + "step": 8650 + }, + { + "epoch": 7.667109340416113, + "grad_norm": 1.4241976737976074, + "learning_rate": 0.0002, + "loss": 1.0161, + "step": 8660 + }, + { + "epoch": 7.675962815405047, + "grad_norm": 1.2062371969223022, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 8670 + }, + { + "epoch": 7.68481629039398, + "grad_norm": 1.288986325263977, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 8680 + }, + { + "epoch": 7.6936697653829125, + "grad_norm": 1.3382292985916138, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 8690 + }, + { + "epoch": 7.702523240371846, + "grad_norm": 1.2282090187072754, + "learning_rate": 0.0002, + "loss": 0.9996, + "step": 8700 + }, + { + "epoch": 7.711376715360779, + "grad_norm": 1.4728269577026367, + "learning_rate": 0.0002, + "loss": 1.0211, + "step": 8710 + }, + { + "epoch": 7.720230190349712, + "grad_norm": 1.0538904666900635, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 8720 + }, + { + "epoch": 7.729083665338646, + "grad_norm": 1.3364583253860474, + "learning_rate": 0.0002, + "loss": 0.947, + "step": 8730 + }, + { + "epoch": 7.7379371403275785, + "grad_norm": 1.4484362602233887, + "learning_rate": 0.0002, + "loss": 0.9769, + "step": 8740 + }, + { + "epoch": 7.746790615316511, + "grad_norm": 1.3406230211257935, + "learning_rate": 0.0002, + "loss": 0.9577, + "step": 8750 + }, + { + "epoch": 7.755644090305445, + "grad_norm": 1.3675546646118164, + "learning_rate": 0.0002, + "loss": 1.1238, + "step": 8760 + }, + { + "epoch": 7.764497565294378, + "grad_norm": 1.490721344947815, + "learning_rate": 0.0002, + "loss": 0.9733, + "step": 8770 + }, + { + "epoch": 7.773351040283311, + "grad_norm": 1.267425775527954, + "learning_rate": 0.0002, + "loss": 0.9654, + "step": 8780 + }, + { + "epoch": 7.7822045152722445, + "grad_norm": 1.3113083839416504, + "learning_rate": 0.0002, + "loss": 0.9661, + "step": 8790 + }, + { + "epoch": 7.791057990261177, + "grad_norm": 1.2262369394302368, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 8800 + }, + { + "epoch": 7.79991146525011, + "grad_norm": 1.2927134037017822, + "learning_rate": 0.0002, + "loss": 1.0915, + "step": 8810 + }, + { + "epoch": 7.808764940239044, + "grad_norm": 1.2576160430908203, + "learning_rate": 0.0002, + "loss": 1.0308, + "step": 8820 + }, + { + "epoch": 7.817618415227977, + "grad_norm": 1.3690781593322754, + "learning_rate": 0.0002, + "loss": 1.0077, + "step": 8830 + }, + { + "epoch": 7.82647189021691, + "grad_norm": 1.3828307390213013, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 8840 + }, + { + "epoch": 7.835325365205843, + "grad_norm": 1.4861878156661987, + "learning_rate": 0.0002, + "loss": 1.0492, + "step": 8850 + }, + { + "epoch": 7.844178840194776, + "grad_norm": 1.403618335723877, + "learning_rate": 0.0002, + "loss": 1.062, + "step": 8860 + }, + { + "epoch": 7.853032315183709, + "grad_norm": 1.4410181045532227, + "learning_rate": 0.0002, + "loss": 1.0392, + "step": 8870 + }, + { + "epoch": 7.861885790172643, + "grad_norm": 1.4488197565078735, + "learning_rate": 0.0002, + "loss": 0.9652, + "step": 8880 + }, + { + "epoch": 7.870739265161576, + "grad_norm": 1.6135752201080322, + "learning_rate": 0.0002, + "loss": 1.0167, + "step": 8890 + }, + { + "epoch": 7.879592740150509, + "grad_norm": 1.264705777168274, + "learning_rate": 0.0002, + "loss": 1.0166, + "step": 8900 + }, + { + "epoch": 7.888446215139442, + "grad_norm": 1.308629035949707, + "learning_rate": 0.0002, + "loss": 1.0288, + "step": 8910 + }, + { + "epoch": 7.897299690128375, + "grad_norm": 1.3849096298217773, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 8920 + }, + { + "epoch": 7.906153165117309, + "grad_norm": 1.4319216012954712, + "learning_rate": 0.0002, + "loss": 1.0059, + "step": 8930 + }, + { + "epoch": 7.915006640106242, + "grad_norm": 1.2494885921478271, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 8940 + }, + { + "epoch": 7.923860115095175, + "grad_norm": 1.4066457748413086, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 8950 + }, + { + "epoch": 7.932713590084108, + "grad_norm": 1.285872459411621, + "learning_rate": 0.0002, + "loss": 1.0867, + "step": 8960 + }, + { + "epoch": 7.941567065073041, + "grad_norm": 1.2378270626068115, + "learning_rate": 0.0002, + "loss": 1.0228, + "step": 8970 + }, + { + "epoch": 7.950420540061974, + "grad_norm": 1.547827124595642, + "learning_rate": 0.0002, + "loss": 1.0107, + "step": 8980 + }, + { + "epoch": 7.959274015050908, + "grad_norm": 1.539252519607544, + "learning_rate": 0.0002, + "loss": 1.0742, + "step": 8990 + }, + { + "epoch": 7.968127490039841, + "grad_norm": 1.230036973953247, + "learning_rate": 0.0002, + "loss": 1.0258, + "step": 9000 + }, + { + "epoch": 7.9769809650287735, + "grad_norm": 1.4130570888519287, + "learning_rate": 0.0002, + "loss": 1.0198, + "step": 9010 + }, + { + "epoch": 7.985834440017707, + "grad_norm": 1.4037895202636719, + "learning_rate": 0.0002, + "loss": 1.0025, + "step": 9020 + }, + { + "epoch": 7.99468791500664, + "grad_norm": 1.4847569465637207, + "learning_rate": 0.0002, + "loss": 0.9551, + "step": 9030 + }, + { + "epoch": 7.996458610004427, + "eval_loss": 2.294736385345459, + "eval_runtime": 82.9425, + "eval_samples_per_second": 6.209, + "eval_steps_per_second": 0.784, + "step": 9032 + } + ], + "logging_steps": 10, + "max_steps": 9032, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.17980736532906e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-9032/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6408cb7ed0be645d6fb12efb9ebcd7bcab9463e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:502feef99fedeea2677424fa05ac9dd15bf387252b0a48aac7fcee8dbc277440 +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..491af3e6c6ad6ac8c83b1dc6ccff0104290a7a7b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9995573262505534, "step": 1129, "epoch_duration": 2897.501981973648, "total_accumulated_duration": 2897.501981973648, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}]} +{"epoch": 2.0, "step": 2259, "epoch_duration": 2947.8830506801605, "total_accumulated_duration": 5845.385032653809, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-1129", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}]} +{"epoch": 2.9995573262505535, "step": 3388, "epoch_duration": 2955.456925392151, "total_accumulated_duration": 8800.84195804596, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}, {"eval_loss": 1.8077166080474854, "eval_runtime": 82.8351, "eval_samples_per_second": 6.217, "eval_steps_per_second": 0.785, "epoch": 2.0, "step": 2259}, {"loss": 1.7081, "grad_norm": 0.34340235590934753, "learning_rate": 0.0002, "epoch": 2.0008853474988935, "step": 2260}, {"loss": 1.6815, "grad_norm": 0.3710762858390808, "learning_rate": 0.0002, "epoch": 2.0097388224878263, "step": 2270}, {"loss": 1.5828, "grad_norm": 0.35640114545822144, "learning_rate": 0.0002, "epoch": 2.0185922974767596, "step": 2280}, {"loss": 1.6322, "grad_norm": 0.45970189571380615, "learning_rate": 0.0002, "epoch": 2.027445772465693, "step": 2290}, {"loss": 1.5598, "grad_norm": 0.4256797134876251, "learning_rate": 0.0002, "epoch": 2.0362992474546258, "step": 2300}, {"loss": 1.6271, "grad_norm": 0.42421531677246094, "learning_rate": 0.0002, "epoch": 2.045152722443559, "step": 2310}, {"loss": 1.6117, "grad_norm": 0.4032478928565979, "learning_rate": 0.0002, "epoch": 2.0540061974324924, "step": 2320}, {"loss": 1.6389, "grad_norm": 0.4073623716831207, "learning_rate": 0.0002, "epoch": 2.062859672421425, "step": 2330}, {"loss": 1.6527, "grad_norm": 0.4845200777053833, "learning_rate": 0.0002, "epoch": 2.0717131474103585, "step": 2340}, {"loss": 1.5734, "grad_norm": 0.40578293800354004, "learning_rate": 0.0002, "epoch": 2.080566622399292, "step": 2350}, {"loss": 1.5853, "grad_norm": 0.4037284255027771, "learning_rate": 0.0002, "epoch": 2.089420097388225, "step": 2360}, {"loss": 1.6511, "grad_norm": 0.4717613160610199, "learning_rate": 0.0002, "epoch": 2.098273572377158, "step": 2370}, {"loss": 1.6273, "grad_norm": 0.42076411843299866, "learning_rate": 0.0002, "epoch": 2.1071270473660912, "step": 2380}, {"loss": 1.654, "grad_norm": 0.47799113392829895, "learning_rate": 0.0002, "epoch": 2.1159805223550245, "step": 2390}, {"loss": 1.5528, "grad_norm": 0.4253084063529968, "learning_rate": 0.0002, "epoch": 2.1248339973439574, "step": 2400}, {"loss": 1.6432, "grad_norm": 0.5023085474967957, "learning_rate": 0.0002, "epoch": 2.1336874723328907, "step": 2410}, {"loss": 1.5926, "grad_norm": 0.49162712693214417, "learning_rate": 0.0002, "epoch": 2.142540947321824, "step": 2420}, {"loss": 1.5779, "grad_norm": 0.39035019278526306, "learning_rate": 0.0002, "epoch": 2.151394422310757, "step": 2430}, {"loss": 1.7526, "grad_norm": 0.43223854899406433, "learning_rate": 0.0002, "epoch": 2.16024789729969, "step": 2440}, {"loss": 1.6334, "grad_norm": 0.4596616327762604, "learning_rate": 0.0002, "epoch": 2.1691013722886234, "step": 2450}, {"loss": 1.6067, "grad_norm": 0.4469447731971741, "learning_rate": 0.0002, "epoch": 2.1779548472775563, "step": 2460}, {"loss": 1.5806, "grad_norm": 0.5100595355033875, "learning_rate": 0.0002, "epoch": 2.1868083222664896, "step": 2470}, {"loss": 1.6456, "grad_norm": 0.4169430732727051, "learning_rate": 0.0002, "epoch": 2.195661797255423, "step": 2480}, {"loss": 1.6734, "grad_norm": 0.4699254035949707, "learning_rate": 0.0002, "epoch": 2.2045152722443557, "step": 2490}, {"loss": 1.6259, "grad_norm": 0.43524250388145447, "learning_rate": 0.0002, "epoch": 2.213368747233289, "step": 2500}, {"loss": 1.6717, "grad_norm": 0.4496648907661438, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2510}, {"loss": 1.6735, "grad_norm": 0.43408212065696716, "learning_rate": 0.0002, "epoch": 2.231075697211155, "step": 2520}, {"loss": 1.611, "grad_norm": 0.4596034288406372, "learning_rate": 0.0002, "epoch": 2.2399291722000885, "step": 2530}, {"loss": 1.6271, "grad_norm": 0.5217021107673645, "learning_rate": 0.0002, "epoch": 2.2487826471890218, "step": 2540}, {"loss": 1.6027, "grad_norm": 0.44745638966560364, "learning_rate": 0.0002, "epoch": 2.2576361221779546, "step": 2550}, {"loss": 1.675, "grad_norm": 0.4484798014163971, "learning_rate": 0.0002, "epoch": 2.266489597166888, "step": 2560}, {"loss": 1.5321, "grad_norm": 0.4428067207336426, "learning_rate": 0.0002, "epoch": 2.275343072155821, "step": 2570}, {"loss": 1.6716, "grad_norm": 0.5095171332359314, "learning_rate": 0.0002, "epoch": 2.2841965471447545, "step": 2580}, {"loss": 1.5661, "grad_norm": 0.44833096861839294, "learning_rate": 0.0002, "epoch": 2.2930500221336874, "step": 2590}, {"loss": 1.652, "grad_norm": 0.507905900478363, "learning_rate": 0.0002, "epoch": 2.3019034971226207, "step": 2600}, {"loss": 1.5963, "grad_norm": 0.40808171033859253, "learning_rate": 0.0002, "epoch": 2.310756972111554, "step": 2610}, {"loss": 1.6574, "grad_norm": 0.4684814214706421, "learning_rate": 0.0002, "epoch": 2.319610447100487, "step": 2620}, {"loss": 1.587, "grad_norm": 0.44864922761917114, "learning_rate": 0.0002, "epoch": 2.32846392208942, "step": 2630}, {"loss": 1.5828, "grad_norm": 0.4174162745475769, "learning_rate": 0.0002, "epoch": 2.3373173970783534, "step": 2640}, {"loss": 1.642, "grad_norm": 0.42314743995666504, "learning_rate": 0.0002, "epoch": 2.3461708720672863, "step": 2650}, {"loss": 1.5884, "grad_norm": 0.49224185943603516, "learning_rate": 0.0002, "epoch": 2.3550243470562195, "step": 2660}, {"loss": 1.5766, "grad_norm": 0.45190292596817017, "learning_rate": 0.0002, "epoch": 2.363877822045153, "step": 2670}, {"loss": 1.6284, "grad_norm": 0.41817107796669006, "learning_rate": 0.0002, "epoch": 2.3727312970340857, "step": 2680}, {"loss": 1.6356, "grad_norm": 0.6436763405799866, "learning_rate": 0.0002, "epoch": 2.381584772023019, "step": 2690}, {"loss": 1.5915, "grad_norm": 0.47175949811935425, "learning_rate": 0.0002, "epoch": 2.3904382470119523, "step": 2700}, {"loss": 1.6303, "grad_norm": 0.480339378118515, "learning_rate": 0.0002, "epoch": 2.3992917220008856, "step": 2710}, {"loss": 1.5697, "grad_norm": 0.4723486006259918, "learning_rate": 0.0002, "epoch": 2.4081451969898184, "step": 2720}, {"loss": 1.54, "grad_norm": 0.4305492043495178, "learning_rate": 0.0002, "epoch": 2.4169986719787517, "step": 2730}, {"loss": 1.71, "grad_norm": 0.5007492303848267, "learning_rate": 0.0002, "epoch": 2.425852146967685, "step": 2740}, {"loss": 1.5369, "grad_norm": 0.5374062061309814, "learning_rate": 0.0002, "epoch": 2.434705621956618, "step": 2750}, {"loss": 1.6156, "grad_norm": 0.45866212248802185, "learning_rate": 0.0002, "epoch": 2.443559096945551, "step": 2760}, {"loss": 1.6066, "grad_norm": 0.47914502024650574, "learning_rate": 0.0002, "epoch": 2.4524125719344845, "step": 2770}, {"loss": 1.5644, "grad_norm": 0.43804746866226196, "learning_rate": 0.0002, "epoch": 2.4612660469234173, "step": 2780}, {"loss": 1.5952, "grad_norm": 0.43656906485557556, "learning_rate": 0.0002, "epoch": 2.4701195219123506, "step": 2790}, {"loss": 1.6311, "grad_norm": 0.4820363521575928, "learning_rate": 0.0002, "epoch": 2.478972996901284, "step": 2800}, {"loss": 1.5375, "grad_norm": 0.4916800558567047, "learning_rate": 0.0002, "epoch": 2.4878264718902168, "step": 2810}, {"loss": 1.5736, "grad_norm": 0.4521256983280182, "learning_rate": 0.0002, "epoch": 2.49667994687915, "step": 2820}, {"loss": 1.6179, "grad_norm": 0.5066806674003601, "learning_rate": 0.0002, "epoch": 2.5055334218680834, "step": 2830}, {"loss": 1.5812, "grad_norm": 0.4768151640892029, "learning_rate": 0.0002, "epoch": 2.514386896857016, "step": 2840}, {"loss": 1.6719, "grad_norm": 0.5144683718681335, "learning_rate": 0.0002, "epoch": 2.5232403718459495, "step": 2850}, {"loss": 1.6063, "grad_norm": 0.4718942940235138, "learning_rate": 0.0002, "epoch": 2.532093846834883, "step": 2860}, {"loss": 1.6099, "grad_norm": 0.4924587309360504, "learning_rate": 0.0002, "epoch": 2.5409473218238157, "step": 2870}, {"loss": 1.5994, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "epoch": 2.549800796812749, "step": 2880}, {"loss": 1.6501, "grad_norm": 0.4836665987968445, "learning_rate": 0.0002, "epoch": 2.5586542718016823, "step": 2890}, {"loss": 1.6518, "grad_norm": 0.4162124991416931, "learning_rate": 0.0002, "epoch": 2.567507746790615, "step": 2900}, {"loss": 1.6471, "grad_norm": 0.4894537925720215, "learning_rate": 0.0002, "epoch": 2.5763612217795484, "step": 2910}, {"loss": 1.6123, "grad_norm": 0.4539397358894348, "learning_rate": 0.0002, "epoch": 2.5852146967684817, "step": 2920}, {"loss": 1.6449, "grad_norm": 0.4718773066997528, "learning_rate": 0.0002, "epoch": 2.5940681717574146, "step": 2930}, {"loss": 1.584, "grad_norm": 0.49989837408065796, "learning_rate": 0.0002, "epoch": 2.602921646746348, "step": 2940}, {"loss": 1.6087, "grad_norm": 0.4862406849861145, "learning_rate": 0.0002, "epoch": 2.611775121735281, "step": 2950}, {"loss": 1.6057, "grad_norm": 0.4244804382324219, "learning_rate": 0.0002, "epoch": 2.620628596724214, "step": 2960}, {"loss": 1.7795, "grad_norm": 0.49304354190826416, "learning_rate": 0.0002, "epoch": 2.6294820717131473, "step": 2970}, {"loss": 1.7255, "grad_norm": 0.4818236529827118, "learning_rate": 0.0002, "epoch": 2.6383355467020806, "step": 2980}, {"loss": 1.621, "grad_norm": 0.5077425837516785, "learning_rate": 0.0002, "epoch": 2.647189021691014, "step": 2990}, {"loss": 1.7064, "grad_norm": 0.4494157135486603, "learning_rate": 0.0002, "epoch": 2.6560424966799467, "step": 3000}, {"loss": 1.6792, "grad_norm": 0.4790278971195221, "learning_rate": 0.0002, "epoch": 2.66489597166888, "step": 3010}, {"loss": 1.6082, "grad_norm": 0.4702624976634979, "learning_rate": 0.0002, "epoch": 2.6737494466578133, "step": 3020}, {"loss": 1.6494, "grad_norm": 0.5082133412361145, "learning_rate": 0.0002, "epoch": 2.682602921646746, "step": 3030}, {"loss": 1.6438, "grad_norm": 0.4553256630897522, "learning_rate": 0.0002, "epoch": 2.6914563966356795, "step": 3040}, {"loss": 1.6155, "grad_norm": 0.4492715001106262, "learning_rate": 0.0002, "epoch": 2.700309871624613, "step": 3050}, {"loss": 1.5367, "grad_norm": 0.4555944502353668, "learning_rate": 0.0002, "epoch": 2.709163346613546, "step": 3060}, {"loss": 1.5793, "grad_norm": 0.5879693031311035, "learning_rate": 0.0002, "epoch": 2.718016821602479, "step": 3070}, {"loss": 1.6357, "grad_norm": 0.4628562927246094, "learning_rate": 0.0002, "epoch": 2.7268702965914122, "step": 3080}, {"loss": 1.6585, "grad_norm": 0.5169575810432434, "learning_rate": 0.0002, "epoch": 2.7357237715803455, "step": 3090}, {"loss": 1.562, "grad_norm": 0.4630090892314911, "learning_rate": 0.0002, "epoch": 2.7445772465692784, "step": 3100}, {"loss": 1.5508, "grad_norm": 0.5437219738960266, "learning_rate": 0.0002, "epoch": 2.7534307215582117, "step": 3110}, {"loss": 1.6442, "grad_norm": 0.5102152228355408, "learning_rate": 0.0002, "epoch": 2.762284196547145, "step": 3120}, {"loss": 1.5448, "grad_norm": 0.48287826776504517, "learning_rate": 0.0002, "epoch": 2.771137671536078, "step": 3130}, {"loss": 1.6657, "grad_norm": 0.4671737253665924, "learning_rate": 0.0002, "epoch": 2.779991146525011, "step": 3140}, {"loss": 1.5864, "grad_norm": 0.5177035331726074, "learning_rate": 0.0002, "epoch": 2.7888446215139444, "step": 3150}, {"loss": 1.5617, "grad_norm": 0.450989305973053, "learning_rate": 0.0002, "epoch": 2.7976980965028773, "step": 3160}, {"loss": 1.597, "grad_norm": 0.45007848739624023, "learning_rate": 0.0002, "epoch": 2.8065515714918106, "step": 3170}, {"loss": 1.7179, "grad_norm": 0.4600294530391693, "learning_rate": 0.0002, "epoch": 2.815405046480744, "step": 3180}, {"loss": 1.6441, "grad_norm": 0.485628604888916, "learning_rate": 0.0002, "epoch": 2.8242585214696767, "step": 3190}, {"loss": 1.6396, "grad_norm": 0.49811574816703796, "learning_rate": 0.0002, "epoch": 2.83311199645861, "step": 3200}, {"loss": 1.6067, "grad_norm": 0.5012516975402832, "learning_rate": 0.0002, "epoch": 2.8419654714475433, "step": 3210}, {"loss": 1.6188, "grad_norm": 0.4552757740020752, "learning_rate": 0.0002, "epoch": 2.850818946436476, "step": 3220}, {"loss": 1.5993, "grad_norm": 0.4539635479450226, "learning_rate": 0.0002, "epoch": 2.8596724214254094, "step": 3230}, {"loss": 1.5957, "grad_norm": 0.5534685850143433, "learning_rate": 0.0002, "epoch": 2.8685258964143427, "step": 3240}, {"loss": 1.6065, "grad_norm": 0.4570811688899994, "learning_rate": 0.0002, "epoch": 2.8773793714032756, "step": 3250}, {"loss": 1.6016, "grad_norm": 0.48181653022766113, "learning_rate": 0.0002, "epoch": 2.886232846392209, "step": 3260}, {"loss": 1.6574, "grad_norm": 0.4871032238006592, "learning_rate": 0.0002, "epoch": 2.895086321381142, "step": 3270}, {"loss": 1.5626, "grad_norm": 0.4643239676952362, "learning_rate": 0.0002, "epoch": 2.903939796370075, "step": 3280}, {"loss": 1.5981, "grad_norm": 0.5024484395980835, "learning_rate": 0.0002, "epoch": 2.9127932713590083, "step": 3290}, {"loss": 1.5756, "grad_norm": 0.4425384998321533, "learning_rate": 0.0002, "epoch": 2.9216467463479416, "step": 3300}, {"loss": 1.644, "grad_norm": 0.459168016910553, "learning_rate": 0.0002, "epoch": 2.9305002213368745, "step": 3310}, {"loss": 1.6404, "grad_norm": 0.4950717091560364, "learning_rate": 0.0002, "epoch": 2.939353696325808, "step": 3320}, {"loss": 1.652, "grad_norm": 0.4516230523586273, "learning_rate": 0.0002, "epoch": 2.948207171314741, "step": 3330}, {"loss": 1.5917, "grad_norm": 0.49523285031318665, "learning_rate": 0.0002, "epoch": 2.957060646303674, "step": 3340}, {"loss": 1.733, "grad_norm": 0.49282631278038025, "learning_rate": 0.0002, "epoch": 2.9659141212926072, "step": 3350}, {"loss": 1.6519, "grad_norm": 0.45825016498565674, "learning_rate": 0.0002, "epoch": 2.9747675962815405, "step": 3360}, {"loss": 1.6607, "grad_norm": 0.4952891170978546, "learning_rate": 0.0002, "epoch": 2.983621071270474, "step": 3370}, {"loss": 1.5981, "grad_norm": 0.42182639241218567, "learning_rate": 0.0002, "epoch": 2.9924745462594067, "step": 3380}]} +{"epoch": 4.0, "step": 4518, "epoch_duration": 2964.9641728401184, "total_accumulated_duration": 11765.806130886078, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}, {"eval_loss": 1.8077166080474854, "eval_runtime": 82.8351, "eval_samples_per_second": 6.217, "eval_steps_per_second": 0.785, "epoch": 2.0, "step": 2259}, {"loss": 1.7081, "grad_norm": 0.34340235590934753, "learning_rate": 0.0002, "epoch": 2.0008853474988935, "step": 2260}, {"loss": 1.6815, "grad_norm": 0.3710762858390808, "learning_rate": 0.0002, "epoch": 2.0097388224878263, "step": 2270}, {"loss": 1.5828, "grad_norm": 0.35640114545822144, "learning_rate": 0.0002, "epoch": 2.0185922974767596, "step": 2280}, {"loss": 1.6322, "grad_norm": 0.45970189571380615, "learning_rate": 0.0002, "epoch": 2.027445772465693, "step": 2290}, {"loss": 1.5598, "grad_norm": 0.4256797134876251, "learning_rate": 0.0002, "epoch": 2.0362992474546258, "step": 2300}, {"loss": 1.6271, "grad_norm": 0.42421531677246094, "learning_rate": 0.0002, "epoch": 2.045152722443559, "step": 2310}, {"loss": 1.6117, "grad_norm": 0.4032478928565979, "learning_rate": 0.0002, "epoch": 2.0540061974324924, "step": 2320}, {"loss": 1.6389, "grad_norm": 0.4073623716831207, "learning_rate": 0.0002, "epoch": 2.062859672421425, "step": 2330}, {"loss": 1.6527, "grad_norm": 0.4845200777053833, "learning_rate": 0.0002, "epoch": 2.0717131474103585, "step": 2340}, {"loss": 1.5734, "grad_norm": 0.40578293800354004, "learning_rate": 0.0002, "epoch": 2.080566622399292, "step": 2350}, {"loss": 1.5853, "grad_norm": 0.4037284255027771, "learning_rate": 0.0002, "epoch": 2.089420097388225, "step": 2360}, {"loss": 1.6511, "grad_norm": 0.4717613160610199, "learning_rate": 0.0002, "epoch": 2.098273572377158, "step": 2370}, {"loss": 1.6273, "grad_norm": 0.42076411843299866, "learning_rate": 0.0002, "epoch": 2.1071270473660912, "step": 2380}, {"loss": 1.654, "grad_norm": 0.47799113392829895, "learning_rate": 0.0002, "epoch": 2.1159805223550245, "step": 2390}, {"loss": 1.5528, "grad_norm": 0.4253084063529968, "learning_rate": 0.0002, "epoch": 2.1248339973439574, "step": 2400}, {"loss": 1.6432, "grad_norm": 0.5023085474967957, "learning_rate": 0.0002, "epoch": 2.1336874723328907, "step": 2410}, {"loss": 1.5926, "grad_norm": 0.49162712693214417, "learning_rate": 0.0002, "epoch": 2.142540947321824, "step": 2420}, {"loss": 1.5779, "grad_norm": 0.39035019278526306, "learning_rate": 0.0002, "epoch": 2.151394422310757, "step": 2430}, {"loss": 1.7526, "grad_norm": 0.43223854899406433, "learning_rate": 0.0002, "epoch": 2.16024789729969, "step": 2440}, {"loss": 1.6334, "grad_norm": 0.4596616327762604, "learning_rate": 0.0002, "epoch": 2.1691013722886234, "step": 2450}, {"loss": 1.6067, "grad_norm": 0.4469447731971741, "learning_rate": 0.0002, "epoch": 2.1779548472775563, "step": 2460}, {"loss": 1.5806, "grad_norm": 0.5100595355033875, "learning_rate": 0.0002, "epoch": 2.1868083222664896, "step": 2470}, {"loss": 1.6456, "grad_norm": 0.4169430732727051, "learning_rate": 0.0002, "epoch": 2.195661797255423, "step": 2480}, {"loss": 1.6734, "grad_norm": 0.4699254035949707, "learning_rate": 0.0002, "epoch": 2.2045152722443557, "step": 2490}, {"loss": 1.6259, "grad_norm": 0.43524250388145447, "learning_rate": 0.0002, "epoch": 2.213368747233289, "step": 2500}, {"loss": 1.6717, "grad_norm": 0.4496648907661438, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2510}, {"loss": 1.6735, "grad_norm": 0.43408212065696716, "learning_rate": 0.0002, "epoch": 2.231075697211155, "step": 2520}, {"loss": 1.611, "grad_norm": 0.4596034288406372, "learning_rate": 0.0002, "epoch": 2.2399291722000885, "step": 2530}, {"loss": 1.6271, "grad_norm": 0.5217021107673645, "learning_rate": 0.0002, "epoch": 2.2487826471890218, "step": 2540}, {"loss": 1.6027, "grad_norm": 0.44745638966560364, "learning_rate": 0.0002, "epoch": 2.2576361221779546, "step": 2550}, {"loss": 1.675, "grad_norm": 0.4484798014163971, "learning_rate": 0.0002, "epoch": 2.266489597166888, "step": 2560}, {"loss": 1.5321, "grad_norm": 0.4428067207336426, "learning_rate": 0.0002, "epoch": 2.275343072155821, "step": 2570}, {"loss": 1.6716, "grad_norm": 0.5095171332359314, "learning_rate": 0.0002, "epoch": 2.2841965471447545, "step": 2580}, {"loss": 1.5661, "grad_norm": 0.44833096861839294, "learning_rate": 0.0002, "epoch": 2.2930500221336874, "step": 2590}, {"loss": 1.652, "grad_norm": 0.507905900478363, "learning_rate": 0.0002, "epoch": 2.3019034971226207, "step": 2600}, {"loss": 1.5963, "grad_norm": 0.40808171033859253, "learning_rate": 0.0002, "epoch": 2.310756972111554, "step": 2610}, {"loss": 1.6574, "grad_norm": 0.4684814214706421, "learning_rate": 0.0002, "epoch": 2.319610447100487, "step": 2620}, {"loss": 1.587, "grad_norm": 0.44864922761917114, "learning_rate": 0.0002, "epoch": 2.32846392208942, "step": 2630}, {"loss": 1.5828, "grad_norm": 0.4174162745475769, "learning_rate": 0.0002, "epoch": 2.3373173970783534, "step": 2640}, {"loss": 1.642, "grad_norm": 0.42314743995666504, "learning_rate": 0.0002, "epoch": 2.3461708720672863, "step": 2650}, {"loss": 1.5884, "grad_norm": 0.49224185943603516, "learning_rate": 0.0002, "epoch": 2.3550243470562195, "step": 2660}, {"loss": 1.5766, "grad_norm": 0.45190292596817017, "learning_rate": 0.0002, "epoch": 2.363877822045153, "step": 2670}, {"loss": 1.6284, "grad_norm": 0.41817107796669006, "learning_rate": 0.0002, "epoch": 2.3727312970340857, "step": 2680}, {"loss": 1.6356, "grad_norm": 0.6436763405799866, "learning_rate": 0.0002, "epoch": 2.381584772023019, "step": 2690}, {"loss": 1.5915, "grad_norm": 0.47175949811935425, "learning_rate": 0.0002, "epoch": 2.3904382470119523, "step": 2700}, {"loss": 1.6303, "grad_norm": 0.480339378118515, "learning_rate": 0.0002, "epoch": 2.3992917220008856, "step": 2710}, {"loss": 1.5697, "grad_norm": 0.4723486006259918, "learning_rate": 0.0002, "epoch": 2.4081451969898184, "step": 2720}, {"loss": 1.54, "grad_norm": 0.4305492043495178, "learning_rate": 0.0002, "epoch": 2.4169986719787517, "step": 2730}, {"loss": 1.71, "grad_norm": 0.5007492303848267, "learning_rate": 0.0002, "epoch": 2.425852146967685, "step": 2740}, {"loss": 1.5369, "grad_norm": 0.5374062061309814, "learning_rate": 0.0002, "epoch": 2.434705621956618, "step": 2750}, {"loss": 1.6156, "grad_norm": 0.45866212248802185, "learning_rate": 0.0002, "epoch": 2.443559096945551, "step": 2760}, {"loss": 1.6066, "grad_norm": 0.47914502024650574, "learning_rate": 0.0002, "epoch": 2.4524125719344845, "step": 2770}, {"loss": 1.5644, "grad_norm": 0.43804746866226196, "learning_rate": 0.0002, "epoch": 2.4612660469234173, "step": 2780}, {"loss": 1.5952, "grad_norm": 0.43656906485557556, "learning_rate": 0.0002, "epoch": 2.4701195219123506, "step": 2790}, {"loss": 1.6311, "grad_norm": 0.4820363521575928, "learning_rate": 0.0002, "epoch": 2.478972996901284, "step": 2800}, {"loss": 1.5375, "grad_norm": 0.4916800558567047, "learning_rate": 0.0002, "epoch": 2.4878264718902168, "step": 2810}, {"loss": 1.5736, "grad_norm": 0.4521256983280182, "learning_rate": 0.0002, "epoch": 2.49667994687915, "step": 2820}, {"loss": 1.6179, "grad_norm": 0.5066806674003601, "learning_rate": 0.0002, "epoch": 2.5055334218680834, "step": 2830}, {"loss": 1.5812, "grad_norm": 0.4768151640892029, "learning_rate": 0.0002, "epoch": 2.514386896857016, "step": 2840}, {"loss": 1.6719, "grad_norm": 0.5144683718681335, "learning_rate": 0.0002, "epoch": 2.5232403718459495, "step": 2850}, {"loss": 1.6063, "grad_norm": 0.4718942940235138, "learning_rate": 0.0002, "epoch": 2.532093846834883, "step": 2860}, {"loss": 1.6099, "grad_norm": 0.4924587309360504, "learning_rate": 0.0002, "epoch": 2.5409473218238157, "step": 2870}, {"loss": 1.5994, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "epoch": 2.549800796812749, "step": 2880}, {"loss": 1.6501, "grad_norm": 0.4836665987968445, "learning_rate": 0.0002, "epoch": 2.5586542718016823, "step": 2890}, {"loss": 1.6518, "grad_norm": 0.4162124991416931, "learning_rate": 0.0002, "epoch": 2.567507746790615, "step": 2900}, {"loss": 1.6471, "grad_norm": 0.4894537925720215, "learning_rate": 0.0002, "epoch": 2.5763612217795484, "step": 2910}, {"loss": 1.6123, "grad_norm": 0.4539397358894348, "learning_rate": 0.0002, "epoch": 2.5852146967684817, "step": 2920}, {"loss": 1.6449, "grad_norm": 0.4718773066997528, "learning_rate": 0.0002, "epoch": 2.5940681717574146, "step": 2930}, {"loss": 1.584, "grad_norm": 0.49989837408065796, "learning_rate": 0.0002, "epoch": 2.602921646746348, "step": 2940}, {"loss": 1.6087, "grad_norm": 0.4862406849861145, "learning_rate": 0.0002, "epoch": 2.611775121735281, "step": 2950}, {"loss": 1.6057, "grad_norm": 0.4244804382324219, "learning_rate": 0.0002, "epoch": 2.620628596724214, "step": 2960}, {"loss": 1.7795, "grad_norm": 0.49304354190826416, "learning_rate": 0.0002, "epoch": 2.6294820717131473, "step": 2970}, {"loss": 1.7255, "grad_norm": 0.4818236529827118, "learning_rate": 0.0002, "epoch": 2.6383355467020806, "step": 2980}, {"loss": 1.621, "grad_norm": 0.5077425837516785, "learning_rate": 0.0002, "epoch": 2.647189021691014, "step": 2990}, {"loss": 1.7064, "grad_norm": 0.4494157135486603, "learning_rate": 0.0002, "epoch": 2.6560424966799467, "step": 3000}, {"loss": 1.6792, "grad_norm": 0.4790278971195221, "learning_rate": 0.0002, "epoch": 2.66489597166888, "step": 3010}, {"loss": 1.6082, "grad_norm": 0.4702624976634979, "learning_rate": 0.0002, "epoch": 2.6737494466578133, "step": 3020}, {"loss": 1.6494, "grad_norm": 0.5082133412361145, "learning_rate": 0.0002, "epoch": 2.682602921646746, "step": 3030}, {"loss": 1.6438, "grad_norm": 0.4553256630897522, "learning_rate": 0.0002, "epoch": 2.6914563966356795, "step": 3040}, {"loss": 1.6155, "grad_norm": 0.4492715001106262, "learning_rate": 0.0002, "epoch": 2.700309871624613, "step": 3050}, {"loss": 1.5367, "grad_norm": 0.4555944502353668, "learning_rate": 0.0002, "epoch": 2.709163346613546, "step": 3060}, {"loss": 1.5793, "grad_norm": 0.5879693031311035, "learning_rate": 0.0002, "epoch": 2.718016821602479, "step": 3070}, {"loss": 1.6357, "grad_norm": 0.4628562927246094, "learning_rate": 0.0002, "epoch": 2.7268702965914122, "step": 3080}, {"loss": 1.6585, "grad_norm": 0.5169575810432434, "learning_rate": 0.0002, "epoch": 2.7357237715803455, "step": 3090}, {"loss": 1.562, "grad_norm": 0.4630090892314911, "learning_rate": 0.0002, "epoch": 2.7445772465692784, "step": 3100}, {"loss": 1.5508, "grad_norm": 0.5437219738960266, "learning_rate": 0.0002, "epoch": 2.7534307215582117, "step": 3110}, {"loss": 1.6442, "grad_norm": 0.5102152228355408, "learning_rate": 0.0002, "epoch": 2.762284196547145, "step": 3120}, {"loss": 1.5448, "grad_norm": 0.48287826776504517, "learning_rate": 0.0002, "epoch": 2.771137671536078, "step": 3130}, {"loss": 1.6657, "grad_norm": 0.4671737253665924, "learning_rate": 0.0002, "epoch": 2.779991146525011, "step": 3140}, {"loss": 1.5864, "grad_norm": 0.5177035331726074, "learning_rate": 0.0002, "epoch": 2.7888446215139444, "step": 3150}, {"loss": 1.5617, "grad_norm": 0.450989305973053, "learning_rate": 0.0002, "epoch": 2.7976980965028773, "step": 3160}, {"loss": 1.597, "grad_norm": 0.45007848739624023, "learning_rate": 0.0002, "epoch": 2.8065515714918106, "step": 3170}, {"loss": 1.7179, "grad_norm": 0.4600294530391693, "learning_rate": 0.0002, "epoch": 2.815405046480744, "step": 3180}, {"loss": 1.6441, "grad_norm": 0.485628604888916, "learning_rate": 0.0002, "epoch": 2.8242585214696767, "step": 3190}, {"loss": 1.6396, "grad_norm": 0.49811574816703796, "learning_rate": 0.0002, "epoch": 2.83311199645861, "step": 3200}, {"loss": 1.6067, "grad_norm": 0.5012516975402832, "learning_rate": 0.0002, "epoch": 2.8419654714475433, "step": 3210}, {"loss": 1.6188, "grad_norm": 0.4552757740020752, "learning_rate": 0.0002, "epoch": 2.850818946436476, "step": 3220}, {"loss": 1.5993, "grad_norm": 0.4539635479450226, "learning_rate": 0.0002, "epoch": 2.8596724214254094, "step": 3230}, {"loss": 1.5957, "grad_norm": 0.5534685850143433, "learning_rate": 0.0002, "epoch": 2.8685258964143427, "step": 3240}, {"loss": 1.6065, "grad_norm": 0.4570811688899994, "learning_rate": 0.0002, "epoch": 2.8773793714032756, "step": 3250}, {"loss": 1.6016, "grad_norm": 0.48181653022766113, "learning_rate": 0.0002, "epoch": 2.886232846392209, "step": 3260}, {"loss": 1.6574, "grad_norm": 0.4871032238006592, "learning_rate": 0.0002, "epoch": 2.895086321381142, "step": 3270}, {"loss": 1.5626, "grad_norm": 0.4643239676952362, "learning_rate": 0.0002, "epoch": 2.903939796370075, "step": 3280}, {"loss": 1.5981, "grad_norm": 0.5024484395980835, "learning_rate": 0.0002, "epoch": 2.9127932713590083, "step": 3290}, {"loss": 1.5756, "grad_norm": 0.4425384998321533, "learning_rate": 0.0002, "epoch": 2.9216467463479416, "step": 3300}, {"loss": 1.644, "grad_norm": 0.459168016910553, "learning_rate": 0.0002, "epoch": 2.9305002213368745, "step": 3310}, {"loss": 1.6404, "grad_norm": 0.4950717091560364, "learning_rate": 0.0002, "epoch": 2.939353696325808, "step": 3320}, {"loss": 1.652, "grad_norm": 0.4516230523586273, "learning_rate": 0.0002, "epoch": 2.948207171314741, "step": 3330}, {"loss": 1.5917, "grad_norm": 0.49523285031318665, "learning_rate": 0.0002, "epoch": 2.957060646303674, "step": 3340}, {"loss": 1.733, "grad_norm": 0.49282631278038025, "learning_rate": 0.0002, "epoch": 2.9659141212926072, "step": 3350}, {"loss": 1.6519, "grad_norm": 0.45825016498565674, "learning_rate": 0.0002, "epoch": 2.9747675962815405, "step": 3360}, {"loss": 1.6607, "grad_norm": 0.4952891170978546, "learning_rate": 0.0002, "epoch": 2.983621071270474, "step": 3370}, {"loss": 1.5981, "grad_norm": 0.42182639241218567, "learning_rate": 0.0002, "epoch": 2.9924745462594067, "step": 3380}, {"eval_loss": 1.8308420181274414, "eval_runtime": 82.786, "eval_samples_per_second": 6.221, "eval_steps_per_second": 0.785, "epoch": 2.9995573262505535, "step": 3388}, {"loss": 1.5811, "grad_norm": 0.47721418738365173, "learning_rate": 0.0002, "epoch": 3.00132802124834, "step": 3390}, {"loss": 1.5137, "grad_norm": 0.5284923911094666, "learning_rate": 0.0002, "epoch": 3.0101814962372733, "step": 3400}, {"loss": 1.437, "grad_norm": 0.5607061982154846, "learning_rate": 0.0002, "epoch": 3.019034971226206, "step": 3410}, {"loss": 1.4909, "grad_norm": 0.5271363258361816, "learning_rate": 0.0002, "epoch": 3.0278884462151394, "step": 3420}, {"loss": 1.5645, "grad_norm": 0.48660898208618164, "learning_rate": 0.0002, "epoch": 3.0367419212040727, "step": 3430}, {"loss": 1.4754, "grad_norm": 0.5767933130264282, "learning_rate": 0.0002, "epoch": 3.0455953961930056, "step": 3440}, {"loss": 1.4647, "grad_norm": 0.5591282248497009, "learning_rate": 0.0002, "epoch": 3.054448871181939, "step": 3450}, {"loss": 1.5112, "grad_norm": 0.5870814323425293, "learning_rate": 0.0002, "epoch": 3.063302346170872, "step": 3460}, {"loss": 1.4682, "grad_norm": 0.4861546456813812, "learning_rate": 0.0002, "epoch": 3.072155821159805, "step": 3470}, {"loss": 1.4883, "grad_norm": 0.5238925814628601, "learning_rate": 0.0002, "epoch": 3.0810092961487383, "step": 3480}, {"loss": 1.4855, "grad_norm": 0.5521751046180725, "learning_rate": 0.0002, "epoch": 3.0898627711376716, "step": 3490}, {"loss": 1.4454, "grad_norm": 0.5816575884819031, "learning_rate": 0.0002, "epoch": 3.098716246126605, "step": 3500}, {"loss": 1.5113, "grad_norm": 0.5281513333320618, "learning_rate": 0.0002, "epoch": 3.1075697211155378, "step": 3510}, {"loss": 1.4723, "grad_norm": 0.5847303867340088, "learning_rate": 0.0002, "epoch": 3.116423196104471, "step": 3520}, {"loss": 1.5513, "grad_norm": 0.5683517456054688, "learning_rate": 0.0002, "epoch": 3.1252766710934043, "step": 3530}, {"loss": 1.532, "grad_norm": 0.5177015662193298, "learning_rate": 0.0002, "epoch": 3.134130146082337, "step": 3540}, {"loss": 1.4921, "grad_norm": 0.5922423601150513, "learning_rate": 0.0002, "epoch": 3.1429836210712705, "step": 3550}, {"loss": 1.5329, "grad_norm": 0.7018587589263916, "learning_rate": 0.0002, "epoch": 3.151837096060204, "step": 3560}, {"loss": 1.4677, "grad_norm": 0.6152004599571228, "learning_rate": 0.0002, "epoch": 3.1606905710491366, "step": 3570}, {"loss": 1.4288, "grad_norm": 0.5350717902183533, "learning_rate": 0.0002, "epoch": 3.16954404603807, "step": 3580}, {"loss": 1.4739, "grad_norm": 0.5971009731292725, "learning_rate": 0.0002, "epoch": 3.1783975210270032, "step": 3590}, {"loss": 1.541, "grad_norm": 0.7312001585960388, "learning_rate": 0.0002, "epoch": 3.187250996015936, "step": 3600}, {"loss": 1.5803, "grad_norm": 0.6372535228729248, "learning_rate": 0.0002, "epoch": 3.1961044710048694, "step": 3610}, {"loss": 1.4642, "grad_norm": 0.6098020672798157, "learning_rate": 0.0002, "epoch": 3.2049579459938027, "step": 3620}, {"loss": 1.5149, "grad_norm": 0.5506435632705688, "learning_rate": 0.0002, "epoch": 3.2138114209827355, "step": 3630}, {"loss": 1.4338, "grad_norm": 0.6043022274971008, "learning_rate": 0.0002, "epoch": 3.222664895971669, "step": 3640}, {"loss": 1.5351, "grad_norm": 0.5495519042015076, "learning_rate": 0.0002, "epoch": 3.231518370960602, "step": 3650}, {"loss": 1.3879, "grad_norm": 0.5769572257995605, "learning_rate": 0.0002, "epoch": 3.240371845949535, "step": 3660}, {"loss": 1.4604, "grad_norm": 0.6833786964416504, "learning_rate": 0.0002, "epoch": 3.2492253209384683, "step": 3670}, {"loss": 1.5091, "grad_norm": 0.6962856650352478, "learning_rate": 0.0002, "epoch": 3.2580787959274016, "step": 3680}, {"loss": 1.5212, "grad_norm": 0.6553098559379578, "learning_rate": 0.0002, "epoch": 3.2669322709163344, "step": 3690}, {"loss": 1.5416, "grad_norm": 0.5907557010650635, "learning_rate": 0.0002, "epoch": 3.2757857459052677, "step": 3700}, {"loss": 1.5012, "grad_norm": 0.5712862014770508, "learning_rate": 0.0002, "epoch": 3.284639220894201, "step": 3710}, {"loss": 1.5073, "grad_norm": 0.573820948600769, "learning_rate": 0.0002, "epoch": 3.2934926958831343, "step": 3720}, {"loss": 1.544, "grad_norm": 0.6650304198265076, "learning_rate": 0.0002, "epoch": 3.302346170872067, "step": 3730}, {"loss": 1.5069, "grad_norm": 0.5182583928108215, "learning_rate": 0.0002, "epoch": 3.3111996458610005, "step": 3740}, {"loss": 1.5254, "grad_norm": 0.5078902840614319, "learning_rate": 0.0002, "epoch": 3.3200531208499338, "step": 3750}, {"loss": 1.4881, "grad_norm": 0.7062374353408813, "learning_rate": 0.0002, "epoch": 3.3289065958388666, "step": 3760}, {"loss": 1.5017, "grad_norm": 0.5711262822151184, "learning_rate": 0.0002, "epoch": 3.3377600708278, "step": 3770}, {"loss": 1.4982, "grad_norm": 0.5624606013298035, "learning_rate": 0.0002, "epoch": 3.346613545816733, "step": 3780}, {"loss": 1.4515, "grad_norm": 0.6008231043815613, "learning_rate": 0.0002, "epoch": 3.355467020805666, "step": 3790}, {"loss": 1.5038, "grad_norm": 0.6120018362998962, "learning_rate": 0.0002, "epoch": 3.3643204957945994, "step": 3800}, {"loss": 1.4918, "grad_norm": 0.5679979920387268, "learning_rate": 0.0002, "epoch": 3.3731739707835326, "step": 3810}, {"loss": 1.5435, "grad_norm": 0.5613794922828674, "learning_rate": 0.0002, "epoch": 3.3820274457724655, "step": 3820}, {"loss": 1.5319, "grad_norm": 0.5328839421272278, "learning_rate": 0.0002, "epoch": 3.390880920761399, "step": 3830}, {"loss": 1.5262, "grad_norm": 0.5960017442703247, "learning_rate": 0.0002, "epoch": 3.399734395750332, "step": 3840}, {"loss": 1.4227, "grad_norm": 0.5264106392860413, "learning_rate": 0.0002, "epoch": 3.4085878707392654, "step": 3850}, {"loss": 1.4766, "grad_norm": 0.6378359198570251, "learning_rate": 0.0002, "epoch": 3.4174413457281982, "step": 3860}, {"loss": 1.4898, "grad_norm": 0.5792967677116394, "learning_rate": 0.0002, "epoch": 3.4262948207171315, "step": 3870}, {"loss": 1.4914, "grad_norm": 0.6836280822753906, "learning_rate": 0.0002, "epoch": 3.435148295706065, "step": 3880}, {"loss": 1.5002, "grad_norm": 0.6073971390724182, "learning_rate": 0.0002, "epoch": 3.4440017706949977, "step": 3890}, {"loss": 1.4473, "grad_norm": 0.5753195881843567, "learning_rate": 0.0002, "epoch": 3.452855245683931, "step": 3900}, {"loss": 1.5332, "grad_norm": 0.6007646918296814, "learning_rate": 0.0002, "epoch": 3.4617087206728643, "step": 3910}, {"loss": 1.515, "grad_norm": 0.6025636196136475, "learning_rate": 0.0002, "epoch": 3.470562195661797, "step": 3920}, {"loss": 1.4612, "grad_norm": 0.6819562315940857, "learning_rate": 0.0002, "epoch": 3.4794156706507304, "step": 3930}, {"loss": 1.518, "grad_norm": 0.6448395848274231, "learning_rate": 0.0002, "epoch": 3.4882691456396637, "step": 3940}, {"loss": 1.5194, "grad_norm": 0.5712178945541382, "learning_rate": 0.0002, "epoch": 3.4971226206285966, "step": 3950}, {"loss": 1.4757, "grad_norm": 0.6300532817840576, "learning_rate": 0.0002, "epoch": 3.50597609561753, "step": 3960}, {"loss": 1.5142, "grad_norm": 0.6120840907096863, "learning_rate": 0.0002, "epoch": 3.514829570606463, "step": 3970}, {"loss": 1.559, "grad_norm": 0.6887575387954712, "learning_rate": 0.0002, "epoch": 3.523683045595396, "step": 3980}, {"loss": 1.5591, "grad_norm": 0.6970235109329224, "learning_rate": 0.0002, "epoch": 3.5325365205843293, "step": 3990}, {"loss": 1.5198, "grad_norm": 0.5818213820457458, "learning_rate": 0.0002, "epoch": 3.5413899955732626, "step": 4000}, {"loss": 1.5367, "grad_norm": 1.0533310174942017, "learning_rate": 0.0002, "epoch": 3.5502434705621955, "step": 4010}, {"loss": 1.5399, "grad_norm": 0.5444280505180359, "learning_rate": 0.0002, "epoch": 3.5590969455511288, "step": 4020}, {"loss": 1.5573, "grad_norm": 0.6007506847381592, "learning_rate": 0.0002, "epoch": 3.567950420540062, "step": 4030}, {"loss": 1.5059, "grad_norm": 0.6088743805885315, "learning_rate": 0.0002, "epoch": 3.576803895528995, "step": 4040}, {"loss": 1.5174, "grad_norm": 0.5934239029884338, "learning_rate": 0.0002, "epoch": 3.585657370517928, "step": 4050}, {"loss": 1.4938, "grad_norm": 0.605251669883728, "learning_rate": 0.0002, "epoch": 3.5945108455068615, "step": 4060}, {"loss": 1.5142, "grad_norm": 0.5903469920158386, "learning_rate": 0.0002, "epoch": 3.6033643204957944, "step": 4070}, {"loss": 1.5234, "grad_norm": 0.6752413511276245, "learning_rate": 0.0002, "epoch": 3.6122177954847277, "step": 4080}, {"loss": 1.5041, "grad_norm": 0.5810418725013733, "learning_rate": 0.0002, "epoch": 3.621071270473661, "step": 4090}, {"loss": 1.5358, "grad_norm": 0.5918573141098022, "learning_rate": 0.0002, "epoch": 3.629924745462594, "step": 4100}, {"loss": 1.499, "grad_norm": 0.6635358333587646, "learning_rate": 0.0002, "epoch": 3.638778220451527, "step": 4110}, {"loss": 1.5021, "grad_norm": 0.5785038471221924, "learning_rate": 0.0002, "epoch": 3.6476316954404604, "step": 4120}, {"loss": 1.5711, "grad_norm": 0.5837879776954651, "learning_rate": 0.0002, "epoch": 3.6564851704293937, "step": 4130}, {"loss": 1.4273, "grad_norm": 0.6449324488639832, "learning_rate": 0.0002, "epoch": 3.6653386454183265, "step": 4140}, {"loss": 1.4608, "grad_norm": 0.6191908717155457, "learning_rate": 0.0002, "epoch": 3.67419212040726, "step": 4150}, {"loss": 1.4567, "grad_norm": 0.6937987208366394, "learning_rate": 0.0002, "epoch": 3.683045595396193, "step": 4160}, {"loss": 1.4136, "grad_norm": 0.581128716468811, "learning_rate": 0.0002, "epoch": 3.6918990703851264, "step": 4170}, {"loss": 1.4204, "grad_norm": 0.6547803282737732, "learning_rate": 0.0002, "epoch": 3.7007525453740593, "step": 4180}, {"loss": 1.4653, "grad_norm": 0.5961150527000427, "learning_rate": 0.0002, "epoch": 3.7096060203629926, "step": 4190}, {"loss": 1.4755, "grad_norm": 0.6197913885116577, "learning_rate": 0.0002, "epoch": 3.718459495351926, "step": 4200}, {"loss": 1.5191, "grad_norm": 0.688565194606781, "learning_rate": 0.0002, "epoch": 3.7273129703408587, "step": 4210}, {"loss": 1.5618, "grad_norm": 0.5832270979881287, "learning_rate": 0.0002, "epoch": 3.736166445329792, "step": 4220}, {"loss": 1.4747, "grad_norm": 0.5643884539604187, "learning_rate": 0.0002, "epoch": 3.7450199203187253, "step": 4230}, {"loss": 1.5242, "grad_norm": 0.6236484050750732, "learning_rate": 0.0002, "epoch": 3.753873395307658, "step": 4240}, {"loss": 1.576, "grad_norm": 0.5367720127105713, "learning_rate": 0.0002, "epoch": 3.7627268702965915, "step": 4250}, {"loss": 1.5234, "grad_norm": 0.5785109400749207, "learning_rate": 0.0002, "epoch": 3.7715803452855248, "step": 4260}, {"loss": 1.4947, "grad_norm": 0.5698465704917908, "learning_rate": 0.0002, "epoch": 3.7804338202744576, "step": 4270}, {"loss": 1.4769, "grad_norm": 0.5748036503791809, "learning_rate": 0.0002, "epoch": 3.789287295263391, "step": 4280}, {"loss": 1.5503, "grad_norm": 0.608147382736206, "learning_rate": 0.0002, "epoch": 3.798140770252324, "step": 4290}, {"loss": 1.5354, "grad_norm": 0.5820456147193909, "learning_rate": 0.0002, "epoch": 3.806994245241257, "step": 4300}, {"loss": 1.5668, "grad_norm": 0.6325612664222717, "learning_rate": 0.0002, "epoch": 3.8158477202301904, "step": 4310}, {"loss": 1.5295, "grad_norm": 0.6465362310409546, "learning_rate": 0.0002, "epoch": 3.8247011952191237, "step": 4320}, {"loss": 1.5048, "grad_norm": 0.5630854368209839, "learning_rate": 0.0002, "epoch": 3.8335546702080565, "step": 4330}, {"loss": 1.5636, "grad_norm": 0.6181462407112122, "learning_rate": 0.0002, "epoch": 3.84240814519699, "step": 4340}, {"loss": 1.5113, "grad_norm": 0.6207571029663086, "learning_rate": 0.0002, "epoch": 3.851261620185923, "step": 4350}, {"loss": 1.5424, "grad_norm": 0.6092919111251831, "learning_rate": 0.0002, "epoch": 3.860115095174856, "step": 4360}, {"loss": 1.5214, "grad_norm": 0.6140493750572205, "learning_rate": 0.0002, "epoch": 3.8689685701637893, "step": 4370}, {"loss": 1.5574, "grad_norm": 0.611575722694397, "learning_rate": 0.0002, "epoch": 3.8778220451527226, "step": 4380}, {"loss": 1.5563, "grad_norm": 0.6288794279098511, "learning_rate": 0.0002, "epoch": 3.8866755201416554, "step": 4390}, {"loss": 1.4967, "grad_norm": 0.6518979072570801, "learning_rate": 0.0002, "epoch": 3.8955289951305887, "step": 4400}, {"loss": 1.5366, "grad_norm": 0.6144753098487854, "learning_rate": 0.0002, "epoch": 3.904382470119522, "step": 4410}, {"loss": 1.6285, "grad_norm": 0.7034937143325806, "learning_rate": 0.0002, "epoch": 3.913235945108455, "step": 4420}, {"loss": 1.4978, "grad_norm": 0.5713187456130981, "learning_rate": 0.0002, "epoch": 3.922089420097388, "step": 4430}, {"loss": 1.5532, "grad_norm": 0.6187576651573181, "learning_rate": 0.0002, "epoch": 3.9309428950863214, "step": 4440}, {"loss": 1.551, "grad_norm": 0.6439383029937744, "learning_rate": 0.0002, "epoch": 3.9397963700752543, "step": 4450}, {"loss": 1.5073, "grad_norm": 0.6133334636688232, "learning_rate": 0.0002, "epoch": 3.9486498450641876, "step": 4460}, {"loss": 1.538, "grad_norm": 0.593463659286499, "learning_rate": 0.0002, "epoch": 3.957503320053121, "step": 4470}, {"loss": 1.5636, "grad_norm": 0.6261998414993286, "learning_rate": 0.0002, "epoch": 3.9663567950420537, "step": 4480}, {"loss": 1.4888, "grad_norm": 0.6153767704963684, "learning_rate": 0.0002, "epoch": 3.975210270030987, "step": 4490}, {"loss": 1.4986, "grad_norm": 0.6184002757072449, "learning_rate": 0.0002, "epoch": 3.9840637450199203, "step": 4500}, {"loss": 1.5134, "grad_norm": 0.5212734341621399, "learning_rate": 0.0002, "epoch": 3.9929172200088536, "step": 4510}]} +{"epoch": 4.999557326250553, "step": 5647, "epoch_duration": 2962.2189841270447, "total_accumulated_duration": 14728.025115013123, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}, {"eval_loss": 1.8077166080474854, "eval_runtime": 82.8351, "eval_samples_per_second": 6.217, "eval_steps_per_second": 0.785, "epoch": 2.0, "step": 2259}, {"loss": 1.7081, "grad_norm": 0.34340235590934753, "learning_rate": 0.0002, "epoch": 2.0008853474988935, "step": 2260}, {"loss": 1.6815, "grad_norm": 0.3710762858390808, "learning_rate": 0.0002, "epoch": 2.0097388224878263, "step": 2270}, {"loss": 1.5828, "grad_norm": 0.35640114545822144, "learning_rate": 0.0002, "epoch": 2.0185922974767596, "step": 2280}, {"loss": 1.6322, "grad_norm": 0.45970189571380615, "learning_rate": 0.0002, "epoch": 2.027445772465693, "step": 2290}, {"loss": 1.5598, "grad_norm": 0.4256797134876251, "learning_rate": 0.0002, "epoch": 2.0362992474546258, "step": 2300}, {"loss": 1.6271, "grad_norm": 0.42421531677246094, "learning_rate": 0.0002, "epoch": 2.045152722443559, "step": 2310}, {"loss": 1.6117, "grad_norm": 0.4032478928565979, "learning_rate": 0.0002, "epoch": 2.0540061974324924, "step": 2320}, {"loss": 1.6389, "grad_norm": 0.4073623716831207, "learning_rate": 0.0002, "epoch": 2.062859672421425, "step": 2330}, {"loss": 1.6527, "grad_norm": 0.4845200777053833, "learning_rate": 0.0002, "epoch": 2.0717131474103585, "step": 2340}, {"loss": 1.5734, "grad_norm": 0.40578293800354004, "learning_rate": 0.0002, "epoch": 2.080566622399292, "step": 2350}, {"loss": 1.5853, "grad_norm": 0.4037284255027771, "learning_rate": 0.0002, "epoch": 2.089420097388225, "step": 2360}, {"loss": 1.6511, "grad_norm": 0.4717613160610199, "learning_rate": 0.0002, "epoch": 2.098273572377158, "step": 2370}, {"loss": 1.6273, "grad_norm": 0.42076411843299866, "learning_rate": 0.0002, "epoch": 2.1071270473660912, "step": 2380}, {"loss": 1.654, "grad_norm": 0.47799113392829895, "learning_rate": 0.0002, "epoch": 2.1159805223550245, "step": 2390}, {"loss": 1.5528, "grad_norm": 0.4253084063529968, "learning_rate": 0.0002, "epoch": 2.1248339973439574, "step": 2400}, {"loss": 1.6432, "grad_norm": 0.5023085474967957, "learning_rate": 0.0002, "epoch": 2.1336874723328907, "step": 2410}, {"loss": 1.5926, "grad_norm": 0.49162712693214417, "learning_rate": 0.0002, "epoch": 2.142540947321824, "step": 2420}, {"loss": 1.5779, "grad_norm": 0.39035019278526306, "learning_rate": 0.0002, "epoch": 2.151394422310757, "step": 2430}, {"loss": 1.7526, "grad_norm": 0.43223854899406433, "learning_rate": 0.0002, "epoch": 2.16024789729969, "step": 2440}, {"loss": 1.6334, "grad_norm": 0.4596616327762604, "learning_rate": 0.0002, "epoch": 2.1691013722886234, "step": 2450}, {"loss": 1.6067, "grad_norm": 0.4469447731971741, "learning_rate": 0.0002, "epoch": 2.1779548472775563, "step": 2460}, {"loss": 1.5806, "grad_norm": 0.5100595355033875, "learning_rate": 0.0002, "epoch": 2.1868083222664896, "step": 2470}, {"loss": 1.6456, "grad_norm": 0.4169430732727051, "learning_rate": 0.0002, "epoch": 2.195661797255423, "step": 2480}, {"loss": 1.6734, "grad_norm": 0.4699254035949707, "learning_rate": 0.0002, "epoch": 2.2045152722443557, "step": 2490}, {"loss": 1.6259, "grad_norm": 0.43524250388145447, "learning_rate": 0.0002, "epoch": 2.213368747233289, "step": 2500}, {"loss": 1.6717, "grad_norm": 0.4496648907661438, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2510}, {"loss": 1.6735, "grad_norm": 0.43408212065696716, "learning_rate": 0.0002, "epoch": 2.231075697211155, "step": 2520}, {"loss": 1.611, "grad_norm": 0.4596034288406372, "learning_rate": 0.0002, "epoch": 2.2399291722000885, "step": 2530}, {"loss": 1.6271, "grad_norm": 0.5217021107673645, "learning_rate": 0.0002, "epoch": 2.2487826471890218, "step": 2540}, {"loss": 1.6027, "grad_norm": 0.44745638966560364, "learning_rate": 0.0002, "epoch": 2.2576361221779546, "step": 2550}, {"loss": 1.675, "grad_norm": 0.4484798014163971, "learning_rate": 0.0002, "epoch": 2.266489597166888, "step": 2560}, {"loss": 1.5321, "grad_norm": 0.4428067207336426, "learning_rate": 0.0002, "epoch": 2.275343072155821, "step": 2570}, {"loss": 1.6716, "grad_norm": 0.5095171332359314, "learning_rate": 0.0002, "epoch": 2.2841965471447545, "step": 2580}, {"loss": 1.5661, "grad_norm": 0.44833096861839294, "learning_rate": 0.0002, "epoch": 2.2930500221336874, "step": 2590}, {"loss": 1.652, "grad_norm": 0.507905900478363, "learning_rate": 0.0002, "epoch": 2.3019034971226207, "step": 2600}, {"loss": 1.5963, "grad_norm": 0.40808171033859253, "learning_rate": 0.0002, "epoch": 2.310756972111554, "step": 2610}, {"loss": 1.6574, "grad_norm": 0.4684814214706421, "learning_rate": 0.0002, "epoch": 2.319610447100487, "step": 2620}, {"loss": 1.587, "grad_norm": 0.44864922761917114, "learning_rate": 0.0002, "epoch": 2.32846392208942, "step": 2630}, {"loss": 1.5828, "grad_norm": 0.4174162745475769, "learning_rate": 0.0002, "epoch": 2.3373173970783534, "step": 2640}, {"loss": 1.642, "grad_norm": 0.42314743995666504, "learning_rate": 0.0002, "epoch": 2.3461708720672863, "step": 2650}, {"loss": 1.5884, "grad_norm": 0.49224185943603516, "learning_rate": 0.0002, "epoch": 2.3550243470562195, "step": 2660}, {"loss": 1.5766, "grad_norm": 0.45190292596817017, "learning_rate": 0.0002, "epoch": 2.363877822045153, "step": 2670}, {"loss": 1.6284, "grad_norm": 0.41817107796669006, "learning_rate": 0.0002, "epoch": 2.3727312970340857, "step": 2680}, {"loss": 1.6356, "grad_norm": 0.6436763405799866, "learning_rate": 0.0002, "epoch": 2.381584772023019, "step": 2690}, {"loss": 1.5915, "grad_norm": 0.47175949811935425, "learning_rate": 0.0002, "epoch": 2.3904382470119523, "step": 2700}, {"loss": 1.6303, "grad_norm": 0.480339378118515, "learning_rate": 0.0002, "epoch": 2.3992917220008856, "step": 2710}, {"loss": 1.5697, "grad_norm": 0.4723486006259918, "learning_rate": 0.0002, "epoch": 2.4081451969898184, "step": 2720}, {"loss": 1.54, "grad_norm": 0.4305492043495178, "learning_rate": 0.0002, "epoch": 2.4169986719787517, "step": 2730}, {"loss": 1.71, "grad_norm": 0.5007492303848267, "learning_rate": 0.0002, "epoch": 2.425852146967685, "step": 2740}, {"loss": 1.5369, "grad_norm": 0.5374062061309814, "learning_rate": 0.0002, "epoch": 2.434705621956618, "step": 2750}, {"loss": 1.6156, "grad_norm": 0.45866212248802185, "learning_rate": 0.0002, "epoch": 2.443559096945551, "step": 2760}, {"loss": 1.6066, "grad_norm": 0.47914502024650574, "learning_rate": 0.0002, "epoch": 2.4524125719344845, "step": 2770}, {"loss": 1.5644, "grad_norm": 0.43804746866226196, "learning_rate": 0.0002, "epoch": 2.4612660469234173, "step": 2780}, {"loss": 1.5952, "grad_norm": 0.43656906485557556, "learning_rate": 0.0002, "epoch": 2.4701195219123506, "step": 2790}, {"loss": 1.6311, "grad_norm": 0.4820363521575928, "learning_rate": 0.0002, "epoch": 2.478972996901284, "step": 2800}, {"loss": 1.5375, "grad_norm": 0.4916800558567047, "learning_rate": 0.0002, "epoch": 2.4878264718902168, "step": 2810}, {"loss": 1.5736, "grad_norm": 0.4521256983280182, "learning_rate": 0.0002, "epoch": 2.49667994687915, "step": 2820}, {"loss": 1.6179, "grad_norm": 0.5066806674003601, "learning_rate": 0.0002, "epoch": 2.5055334218680834, "step": 2830}, {"loss": 1.5812, "grad_norm": 0.4768151640892029, "learning_rate": 0.0002, "epoch": 2.514386896857016, "step": 2840}, {"loss": 1.6719, "grad_norm": 0.5144683718681335, "learning_rate": 0.0002, "epoch": 2.5232403718459495, "step": 2850}, {"loss": 1.6063, "grad_norm": 0.4718942940235138, "learning_rate": 0.0002, "epoch": 2.532093846834883, "step": 2860}, {"loss": 1.6099, "grad_norm": 0.4924587309360504, "learning_rate": 0.0002, "epoch": 2.5409473218238157, "step": 2870}, {"loss": 1.5994, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "epoch": 2.549800796812749, "step": 2880}, {"loss": 1.6501, "grad_norm": 0.4836665987968445, "learning_rate": 0.0002, "epoch": 2.5586542718016823, "step": 2890}, {"loss": 1.6518, "grad_norm": 0.4162124991416931, "learning_rate": 0.0002, "epoch": 2.567507746790615, "step": 2900}, {"loss": 1.6471, "grad_norm": 0.4894537925720215, "learning_rate": 0.0002, "epoch": 2.5763612217795484, "step": 2910}, {"loss": 1.6123, "grad_norm": 0.4539397358894348, "learning_rate": 0.0002, "epoch": 2.5852146967684817, "step": 2920}, {"loss": 1.6449, "grad_norm": 0.4718773066997528, "learning_rate": 0.0002, "epoch": 2.5940681717574146, "step": 2930}, {"loss": 1.584, "grad_norm": 0.49989837408065796, "learning_rate": 0.0002, "epoch": 2.602921646746348, "step": 2940}, {"loss": 1.6087, "grad_norm": 0.4862406849861145, "learning_rate": 0.0002, "epoch": 2.611775121735281, "step": 2950}, {"loss": 1.6057, "grad_norm": 0.4244804382324219, "learning_rate": 0.0002, "epoch": 2.620628596724214, "step": 2960}, {"loss": 1.7795, "grad_norm": 0.49304354190826416, "learning_rate": 0.0002, "epoch": 2.6294820717131473, "step": 2970}, {"loss": 1.7255, "grad_norm": 0.4818236529827118, "learning_rate": 0.0002, "epoch": 2.6383355467020806, "step": 2980}, {"loss": 1.621, "grad_norm": 0.5077425837516785, "learning_rate": 0.0002, "epoch": 2.647189021691014, "step": 2990}, {"loss": 1.7064, "grad_norm": 0.4494157135486603, "learning_rate": 0.0002, "epoch": 2.6560424966799467, "step": 3000}, {"loss": 1.6792, "grad_norm": 0.4790278971195221, "learning_rate": 0.0002, "epoch": 2.66489597166888, "step": 3010}, {"loss": 1.6082, "grad_norm": 0.4702624976634979, "learning_rate": 0.0002, "epoch": 2.6737494466578133, "step": 3020}, {"loss": 1.6494, "grad_norm": 0.5082133412361145, "learning_rate": 0.0002, "epoch": 2.682602921646746, "step": 3030}, {"loss": 1.6438, "grad_norm": 0.4553256630897522, "learning_rate": 0.0002, "epoch": 2.6914563966356795, "step": 3040}, {"loss": 1.6155, "grad_norm": 0.4492715001106262, "learning_rate": 0.0002, "epoch": 2.700309871624613, "step": 3050}, {"loss": 1.5367, "grad_norm": 0.4555944502353668, "learning_rate": 0.0002, "epoch": 2.709163346613546, "step": 3060}, {"loss": 1.5793, "grad_norm": 0.5879693031311035, "learning_rate": 0.0002, "epoch": 2.718016821602479, "step": 3070}, {"loss": 1.6357, "grad_norm": 0.4628562927246094, "learning_rate": 0.0002, "epoch": 2.7268702965914122, "step": 3080}, {"loss": 1.6585, "grad_norm": 0.5169575810432434, "learning_rate": 0.0002, "epoch": 2.7357237715803455, "step": 3090}, {"loss": 1.562, "grad_norm": 0.4630090892314911, "learning_rate": 0.0002, "epoch": 2.7445772465692784, "step": 3100}, {"loss": 1.5508, "grad_norm": 0.5437219738960266, "learning_rate": 0.0002, "epoch": 2.7534307215582117, "step": 3110}, {"loss": 1.6442, "grad_norm": 0.5102152228355408, "learning_rate": 0.0002, "epoch": 2.762284196547145, "step": 3120}, {"loss": 1.5448, "grad_norm": 0.48287826776504517, "learning_rate": 0.0002, "epoch": 2.771137671536078, "step": 3130}, {"loss": 1.6657, "grad_norm": 0.4671737253665924, "learning_rate": 0.0002, "epoch": 2.779991146525011, "step": 3140}, {"loss": 1.5864, "grad_norm": 0.5177035331726074, "learning_rate": 0.0002, "epoch": 2.7888446215139444, "step": 3150}, {"loss": 1.5617, "grad_norm": 0.450989305973053, "learning_rate": 0.0002, "epoch": 2.7976980965028773, "step": 3160}, {"loss": 1.597, "grad_norm": 0.45007848739624023, "learning_rate": 0.0002, "epoch": 2.8065515714918106, "step": 3170}, {"loss": 1.7179, "grad_norm": 0.4600294530391693, "learning_rate": 0.0002, "epoch": 2.815405046480744, "step": 3180}, {"loss": 1.6441, "grad_norm": 0.485628604888916, "learning_rate": 0.0002, "epoch": 2.8242585214696767, "step": 3190}, {"loss": 1.6396, "grad_norm": 0.49811574816703796, "learning_rate": 0.0002, "epoch": 2.83311199645861, "step": 3200}, {"loss": 1.6067, "grad_norm": 0.5012516975402832, "learning_rate": 0.0002, "epoch": 2.8419654714475433, "step": 3210}, {"loss": 1.6188, "grad_norm": 0.4552757740020752, "learning_rate": 0.0002, "epoch": 2.850818946436476, "step": 3220}, {"loss": 1.5993, "grad_norm": 0.4539635479450226, "learning_rate": 0.0002, "epoch": 2.8596724214254094, "step": 3230}, {"loss": 1.5957, "grad_norm": 0.5534685850143433, "learning_rate": 0.0002, "epoch": 2.8685258964143427, "step": 3240}, {"loss": 1.6065, "grad_norm": 0.4570811688899994, "learning_rate": 0.0002, "epoch": 2.8773793714032756, "step": 3250}, {"loss": 1.6016, "grad_norm": 0.48181653022766113, "learning_rate": 0.0002, "epoch": 2.886232846392209, "step": 3260}, {"loss": 1.6574, "grad_norm": 0.4871032238006592, "learning_rate": 0.0002, "epoch": 2.895086321381142, "step": 3270}, {"loss": 1.5626, "grad_norm": 0.4643239676952362, "learning_rate": 0.0002, "epoch": 2.903939796370075, "step": 3280}, {"loss": 1.5981, "grad_norm": 0.5024484395980835, "learning_rate": 0.0002, "epoch": 2.9127932713590083, "step": 3290}, {"loss": 1.5756, "grad_norm": 0.4425384998321533, "learning_rate": 0.0002, "epoch": 2.9216467463479416, "step": 3300}, {"loss": 1.644, "grad_norm": 0.459168016910553, "learning_rate": 0.0002, "epoch": 2.9305002213368745, "step": 3310}, {"loss": 1.6404, "grad_norm": 0.4950717091560364, "learning_rate": 0.0002, "epoch": 2.939353696325808, "step": 3320}, {"loss": 1.652, "grad_norm": 0.4516230523586273, "learning_rate": 0.0002, "epoch": 2.948207171314741, "step": 3330}, {"loss": 1.5917, "grad_norm": 0.49523285031318665, "learning_rate": 0.0002, "epoch": 2.957060646303674, "step": 3340}, {"loss": 1.733, "grad_norm": 0.49282631278038025, "learning_rate": 0.0002, "epoch": 2.9659141212926072, "step": 3350}, {"loss": 1.6519, "grad_norm": 0.45825016498565674, "learning_rate": 0.0002, "epoch": 2.9747675962815405, "step": 3360}, {"loss": 1.6607, "grad_norm": 0.4952891170978546, "learning_rate": 0.0002, "epoch": 2.983621071270474, "step": 3370}, {"loss": 1.5981, "grad_norm": 0.42182639241218567, "learning_rate": 0.0002, "epoch": 2.9924745462594067, "step": 3380}, {"eval_loss": 1.8308420181274414, "eval_runtime": 82.786, "eval_samples_per_second": 6.221, "eval_steps_per_second": 0.785, "epoch": 2.9995573262505535, "step": 3388}, {"loss": 1.5811, "grad_norm": 0.47721418738365173, "learning_rate": 0.0002, "epoch": 3.00132802124834, "step": 3390}, {"loss": 1.5137, "grad_norm": 0.5284923911094666, "learning_rate": 0.0002, "epoch": 3.0101814962372733, "step": 3400}, {"loss": 1.437, "grad_norm": 0.5607061982154846, "learning_rate": 0.0002, "epoch": 3.019034971226206, "step": 3410}, {"loss": 1.4909, "grad_norm": 0.5271363258361816, "learning_rate": 0.0002, "epoch": 3.0278884462151394, "step": 3420}, {"loss": 1.5645, "grad_norm": 0.48660898208618164, "learning_rate": 0.0002, "epoch": 3.0367419212040727, "step": 3430}, {"loss": 1.4754, "grad_norm": 0.5767933130264282, "learning_rate": 0.0002, "epoch": 3.0455953961930056, "step": 3440}, {"loss": 1.4647, "grad_norm": 0.5591282248497009, "learning_rate": 0.0002, "epoch": 3.054448871181939, "step": 3450}, {"loss": 1.5112, "grad_norm": 0.5870814323425293, "learning_rate": 0.0002, "epoch": 3.063302346170872, "step": 3460}, {"loss": 1.4682, "grad_norm": 0.4861546456813812, "learning_rate": 0.0002, "epoch": 3.072155821159805, "step": 3470}, {"loss": 1.4883, "grad_norm": 0.5238925814628601, "learning_rate": 0.0002, "epoch": 3.0810092961487383, "step": 3480}, {"loss": 1.4855, "grad_norm": 0.5521751046180725, "learning_rate": 0.0002, "epoch": 3.0898627711376716, "step": 3490}, {"loss": 1.4454, "grad_norm": 0.5816575884819031, "learning_rate": 0.0002, "epoch": 3.098716246126605, "step": 3500}, {"loss": 1.5113, "grad_norm": 0.5281513333320618, "learning_rate": 0.0002, "epoch": 3.1075697211155378, "step": 3510}, {"loss": 1.4723, "grad_norm": 0.5847303867340088, "learning_rate": 0.0002, "epoch": 3.116423196104471, "step": 3520}, {"loss": 1.5513, "grad_norm": 0.5683517456054688, "learning_rate": 0.0002, "epoch": 3.1252766710934043, "step": 3530}, {"loss": 1.532, "grad_norm": 0.5177015662193298, "learning_rate": 0.0002, "epoch": 3.134130146082337, "step": 3540}, {"loss": 1.4921, "grad_norm": 0.5922423601150513, "learning_rate": 0.0002, "epoch": 3.1429836210712705, "step": 3550}, {"loss": 1.5329, "grad_norm": 0.7018587589263916, "learning_rate": 0.0002, "epoch": 3.151837096060204, "step": 3560}, {"loss": 1.4677, "grad_norm": 0.6152004599571228, "learning_rate": 0.0002, "epoch": 3.1606905710491366, "step": 3570}, {"loss": 1.4288, "grad_norm": 0.5350717902183533, "learning_rate": 0.0002, "epoch": 3.16954404603807, "step": 3580}, {"loss": 1.4739, "grad_norm": 0.5971009731292725, "learning_rate": 0.0002, "epoch": 3.1783975210270032, "step": 3590}, {"loss": 1.541, "grad_norm": 0.7312001585960388, "learning_rate": 0.0002, "epoch": 3.187250996015936, "step": 3600}, {"loss": 1.5803, "grad_norm": 0.6372535228729248, "learning_rate": 0.0002, "epoch": 3.1961044710048694, "step": 3610}, {"loss": 1.4642, "grad_norm": 0.6098020672798157, "learning_rate": 0.0002, "epoch": 3.2049579459938027, "step": 3620}, {"loss": 1.5149, "grad_norm": 0.5506435632705688, "learning_rate": 0.0002, "epoch": 3.2138114209827355, "step": 3630}, {"loss": 1.4338, "grad_norm": 0.6043022274971008, "learning_rate": 0.0002, "epoch": 3.222664895971669, "step": 3640}, {"loss": 1.5351, "grad_norm": 0.5495519042015076, "learning_rate": 0.0002, "epoch": 3.231518370960602, "step": 3650}, {"loss": 1.3879, "grad_norm": 0.5769572257995605, "learning_rate": 0.0002, "epoch": 3.240371845949535, "step": 3660}, {"loss": 1.4604, "grad_norm": 0.6833786964416504, "learning_rate": 0.0002, "epoch": 3.2492253209384683, "step": 3670}, {"loss": 1.5091, "grad_norm": 0.6962856650352478, "learning_rate": 0.0002, "epoch": 3.2580787959274016, "step": 3680}, {"loss": 1.5212, "grad_norm": 0.6553098559379578, "learning_rate": 0.0002, "epoch": 3.2669322709163344, "step": 3690}, {"loss": 1.5416, "grad_norm": 0.5907557010650635, "learning_rate": 0.0002, "epoch": 3.2757857459052677, "step": 3700}, {"loss": 1.5012, "grad_norm": 0.5712862014770508, "learning_rate": 0.0002, "epoch": 3.284639220894201, "step": 3710}, {"loss": 1.5073, "grad_norm": 0.573820948600769, "learning_rate": 0.0002, "epoch": 3.2934926958831343, "step": 3720}, {"loss": 1.544, "grad_norm": 0.6650304198265076, "learning_rate": 0.0002, "epoch": 3.302346170872067, "step": 3730}, {"loss": 1.5069, "grad_norm": 0.5182583928108215, "learning_rate": 0.0002, "epoch": 3.3111996458610005, "step": 3740}, {"loss": 1.5254, "grad_norm": 0.5078902840614319, "learning_rate": 0.0002, "epoch": 3.3200531208499338, "step": 3750}, {"loss": 1.4881, "grad_norm": 0.7062374353408813, "learning_rate": 0.0002, "epoch": 3.3289065958388666, "step": 3760}, {"loss": 1.5017, "grad_norm": 0.5711262822151184, "learning_rate": 0.0002, "epoch": 3.3377600708278, "step": 3770}, {"loss": 1.4982, "grad_norm": 0.5624606013298035, "learning_rate": 0.0002, "epoch": 3.346613545816733, "step": 3780}, {"loss": 1.4515, "grad_norm": 0.6008231043815613, "learning_rate": 0.0002, "epoch": 3.355467020805666, "step": 3790}, {"loss": 1.5038, "grad_norm": 0.6120018362998962, "learning_rate": 0.0002, "epoch": 3.3643204957945994, "step": 3800}, {"loss": 1.4918, "grad_norm": 0.5679979920387268, "learning_rate": 0.0002, "epoch": 3.3731739707835326, "step": 3810}, {"loss": 1.5435, "grad_norm": 0.5613794922828674, "learning_rate": 0.0002, "epoch": 3.3820274457724655, "step": 3820}, {"loss": 1.5319, "grad_norm": 0.5328839421272278, "learning_rate": 0.0002, "epoch": 3.390880920761399, "step": 3830}, {"loss": 1.5262, "grad_norm": 0.5960017442703247, "learning_rate": 0.0002, "epoch": 3.399734395750332, "step": 3840}, {"loss": 1.4227, "grad_norm": 0.5264106392860413, "learning_rate": 0.0002, "epoch": 3.4085878707392654, "step": 3850}, {"loss": 1.4766, "grad_norm": 0.6378359198570251, "learning_rate": 0.0002, "epoch": 3.4174413457281982, "step": 3860}, {"loss": 1.4898, "grad_norm": 0.5792967677116394, "learning_rate": 0.0002, "epoch": 3.4262948207171315, "step": 3870}, {"loss": 1.4914, "grad_norm": 0.6836280822753906, "learning_rate": 0.0002, "epoch": 3.435148295706065, "step": 3880}, {"loss": 1.5002, "grad_norm": 0.6073971390724182, "learning_rate": 0.0002, "epoch": 3.4440017706949977, "step": 3890}, {"loss": 1.4473, "grad_norm": 0.5753195881843567, "learning_rate": 0.0002, "epoch": 3.452855245683931, "step": 3900}, {"loss": 1.5332, "grad_norm": 0.6007646918296814, "learning_rate": 0.0002, "epoch": 3.4617087206728643, "step": 3910}, {"loss": 1.515, "grad_norm": 0.6025636196136475, "learning_rate": 0.0002, "epoch": 3.470562195661797, "step": 3920}, {"loss": 1.4612, "grad_norm": 0.6819562315940857, "learning_rate": 0.0002, "epoch": 3.4794156706507304, "step": 3930}, {"loss": 1.518, "grad_norm": 0.6448395848274231, "learning_rate": 0.0002, "epoch": 3.4882691456396637, "step": 3940}, {"loss": 1.5194, "grad_norm": 0.5712178945541382, "learning_rate": 0.0002, "epoch": 3.4971226206285966, "step": 3950}, {"loss": 1.4757, "grad_norm": 0.6300532817840576, "learning_rate": 0.0002, "epoch": 3.50597609561753, "step": 3960}, {"loss": 1.5142, "grad_norm": 0.6120840907096863, "learning_rate": 0.0002, "epoch": 3.514829570606463, "step": 3970}, {"loss": 1.559, "grad_norm": 0.6887575387954712, "learning_rate": 0.0002, "epoch": 3.523683045595396, "step": 3980}, {"loss": 1.5591, "grad_norm": 0.6970235109329224, "learning_rate": 0.0002, "epoch": 3.5325365205843293, "step": 3990}, {"loss": 1.5198, "grad_norm": 0.5818213820457458, "learning_rate": 0.0002, "epoch": 3.5413899955732626, "step": 4000}, {"loss": 1.5367, "grad_norm": 1.0533310174942017, "learning_rate": 0.0002, "epoch": 3.5502434705621955, "step": 4010}, {"loss": 1.5399, "grad_norm": 0.5444280505180359, "learning_rate": 0.0002, "epoch": 3.5590969455511288, "step": 4020}, {"loss": 1.5573, "grad_norm": 0.6007506847381592, "learning_rate": 0.0002, "epoch": 3.567950420540062, "step": 4030}, {"loss": 1.5059, "grad_norm": 0.6088743805885315, "learning_rate": 0.0002, "epoch": 3.576803895528995, "step": 4040}, {"loss": 1.5174, "grad_norm": 0.5934239029884338, "learning_rate": 0.0002, "epoch": 3.585657370517928, "step": 4050}, {"loss": 1.4938, "grad_norm": 0.605251669883728, "learning_rate": 0.0002, "epoch": 3.5945108455068615, "step": 4060}, {"loss": 1.5142, "grad_norm": 0.5903469920158386, "learning_rate": 0.0002, "epoch": 3.6033643204957944, "step": 4070}, {"loss": 1.5234, "grad_norm": 0.6752413511276245, "learning_rate": 0.0002, "epoch": 3.6122177954847277, "step": 4080}, {"loss": 1.5041, "grad_norm": 0.5810418725013733, "learning_rate": 0.0002, "epoch": 3.621071270473661, "step": 4090}, {"loss": 1.5358, "grad_norm": 0.5918573141098022, "learning_rate": 0.0002, "epoch": 3.629924745462594, "step": 4100}, {"loss": 1.499, "grad_norm": 0.6635358333587646, "learning_rate": 0.0002, "epoch": 3.638778220451527, "step": 4110}, {"loss": 1.5021, "grad_norm": 0.5785038471221924, "learning_rate": 0.0002, "epoch": 3.6476316954404604, "step": 4120}, {"loss": 1.5711, "grad_norm": 0.5837879776954651, "learning_rate": 0.0002, "epoch": 3.6564851704293937, "step": 4130}, {"loss": 1.4273, "grad_norm": 0.6449324488639832, "learning_rate": 0.0002, "epoch": 3.6653386454183265, "step": 4140}, {"loss": 1.4608, "grad_norm": 0.6191908717155457, "learning_rate": 0.0002, "epoch": 3.67419212040726, "step": 4150}, {"loss": 1.4567, "grad_norm": 0.6937987208366394, "learning_rate": 0.0002, "epoch": 3.683045595396193, "step": 4160}, {"loss": 1.4136, "grad_norm": 0.581128716468811, "learning_rate": 0.0002, "epoch": 3.6918990703851264, "step": 4170}, {"loss": 1.4204, "grad_norm": 0.6547803282737732, "learning_rate": 0.0002, "epoch": 3.7007525453740593, "step": 4180}, {"loss": 1.4653, "grad_norm": 0.5961150527000427, "learning_rate": 0.0002, "epoch": 3.7096060203629926, "step": 4190}, {"loss": 1.4755, "grad_norm": 0.6197913885116577, "learning_rate": 0.0002, "epoch": 3.718459495351926, "step": 4200}, {"loss": 1.5191, "grad_norm": 0.688565194606781, "learning_rate": 0.0002, "epoch": 3.7273129703408587, "step": 4210}, {"loss": 1.5618, "grad_norm": 0.5832270979881287, "learning_rate": 0.0002, "epoch": 3.736166445329792, "step": 4220}, {"loss": 1.4747, "grad_norm": 0.5643884539604187, "learning_rate": 0.0002, "epoch": 3.7450199203187253, "step": 4230}, {"loss": 1.5242, "grad_norm": 0.6236484050750732, "learning_rate": 0.0002, "epoch": 3.753873395307658, "step": 4240}, {"loss": 1.576, "grad_norm": 0.5367720127105713, "learning_rate": 0.0002, "epoch": 3.7627268702965915, "step": 4250}, {"loss": 1.5234, "grad_norm": 0.5785109400749207, "learning_rate": 0.0002, "epoch": 3.7715803452855248, "step": 4260}, {"loss": 1.4947, "grad_norm": 0.5698465704917908, "learning_rate": 0.0002, "epoch": 3.7804338202744576, "step": 4270}, {"loss": 1.4769, "grad_norm": 0.5748036503791809, "learning_rate": 0.0002, "epoch": 3.789287295263391, "step": 4280}, {"loss": 1.5503, "grad_norm": 0.608147382736206, "learning_rate": 0.0002, "epoch": 3.798140770252324, "step": 4290}, {"loss": 1.5354, "grad_norm": 0.5820456147193909, "learning_rate": 0.0002, "epoch": 3.806994245241257, "step": 4300}, {"loss": 1.5668, "grad_norm": 0.6325612664222717, "learning_rate": 0.0002, "epoch": 3.8158477202301904, "step": 4310}, {"loss": 1.5295, "grad_norm": 0.6465362310409546, "learning_rate": 0.0002, "epoch": 3.8247011952191237, "step": 4320}, {"loss": 1.5048, "grad_norm": 0.5630854368209839, "learning_rate": 0.0002, "epoch": 3.8335546702080565, "step": 4330}, {"loss": 1.5636, "grad_norm": 0.6181462407112122, "learning_rate": 0.0002, "epoch": 3.84240814519699, "step": 4340}, {"loss": 1.5113, "grad_norm": 0.6207571029663086, "learning_rate": 0.0002, "epoch": 3.851261620185923, "step": 4350}, {"loss": 1.5424, "grad_norm": 0.6092919111251831, "learning_rate": 0.0002, "epoch": 3.860115095174856, "step": 4360}, {"loss": 1.5214, "grad_norm": 0.6140493750572205, "learning_rate": 0.0002, "epoch": 3.8689685701637893, "step": 4370}, {"loss": 1.5574, "grad_norm": 0.611575722694397, "learning_rate": 0.0002, "epoch": 3.8778220451527226, "step": 4380}, {"loss": 1.5563, "grad_norm": 0.6288794279098511, "learning_rate": 0.0002, "epoch": 3.8866755201416554, "step": 4390}, {"loss": 1.4967, "grad_norm": 0.6518979072570801, "learning_rate": 0.0002, "epoch": 3.8955289951305887, "step": 4400}, {"loss": 1.5366, "grad_norm": 0.6144753098487854, "learning_rate": 0.0002, "epoch": 3.904382470119522, "step": 4410}, {"loss": 1.6285, "grad_norm": 0.7034937143325806, "learning_rate": 0.0002, "epoch": 3.913235945108455, "step": 4420}, {"loss": 1.4978, "grad_norm": 0.5713187456130981, "learning_rate": 0.0002, "epoch": 3.922089420097388, "step": 4430}, {"loss": 1.5532, "grad_norm": 0.6187576651573181, "learning_rate": 0.0002, "epoch": 3.9309428950863214, "step": 4440}, {"loss": 1.551, "grad_norm": 0.6439383029937744, "learning_rate": 0.0002, "epoch": 3.9397963700752543, "step": 4450}, {"loss": 1.5073, "grad_norm": 0.6133334636688232, "learning_rate": 0.0002, "epoch": 3.9486498450641876, "step": 4460}, {"loss": 1.538, "grad_norm": 0.593463659286499, "learning_rate": 0.0002, "epoch": 3.957503320053121, "step": 4470}, {"loss": 1.5636, "grad_norm": 0.6261998414993286, "learning_rate": 0.0002, "epoch": 3.9663567950420537, "step": 4480}, {"loss": 1.4888, "grad_norm": 0.6153767704963684, "learning_rate": 0.0002, "epoch": 3.975210270030987, "step": 4490}, {"loss": 1.4986, "grad_norm": 0.6184002757072449, "learning_rate": 0.0002, "epoch": 3.9840637450199203, "step": 4500}, {"loss": 1.5134, "grad_norm": 0.5212734341621399, "learning_rate": 0.0002, "epoch": 3.9929172200088536, "step": 4510}, {"eval_loss": 1.8745536804199219, "eval_runtime": 83.0125, "eval_samples_per_second": 6.204, "eval_steps_per_second": 0.783, "epoch": 4.0, "step": 4518}, {"loss": 1.4708, "grad_norm": 0.5871603488922119, "learning_rate": 0.0002, "epoch": 4.001770694997787, "step": 4520}, {"loss": 1.4139, "grad_norm": 0.6746091842651367, "learning_rate": 0.0002, "epoch": 4.01062416998672, "step": 4530}, {"loss": 1.3625, "grad_norm": 0.6159639358520508, "learning_rate": 0.0002, "epoch": 4.019477644975653, "step": 4540}, {"loss": 1.3766, "grad_norm": 0.7529398202896118, "learning_rate": 0.0002, "epoch": 4.028331119964586, "step": 4550}, {"loss": 1.3202, "grad_norm": 0.788398027420044, "learning_rate": 0.0002, "epoch": 4.037184594953519, "step": 4560}, {"loss": 1.4254, "grad_norm": 0.9679850935935974, "learning_rate": 0.0002, "epoch": 4.046038069942452, "step": 4570}, {"loss": 1.2911, "grad_norm": 0.6305310130119324, "learning_rate": 0.0002, "epoch": 4.054891544931386, "step": 4580}, {"loss": 1.3525, "grad_norm": 0.8557451963424683, "learning_rate": 0.0002, "epoch": 4.063745019920319, "step": 4590}, {"loss": 1.3901, "grad_norm": 0.741518497467041, "learning_rate": 0.0002, "epoch": 4.0725984949092515, "step": 4600}, {"loss": 1.3374, "grad_norm": 0.6573862433433533, "learning_rate": 0.0002, "epoch": 4.081451969898185, "step": 4610}, {"loss": 1.3341, "grad_norm": 0.6926319599151611, "learning_rate": 0.0002, "epoch": 4.090305444887118, "step": 4620}, {"loss": 1.4176, "grad_norm": 0.9212626218795776, "learning_rate": 0.0002, "epoch": 4.099158919876051, "step": 4630}, {"loss": 1.3402, "grad_norm": 0.7167867422103882, "learning_rate": 0.0002, "epoch": 4.108012394864985, "step": 4640}, {"loss": 1.3333, "grad_norm": 0.6691595911979675, "learning_rate": 0.0002, "epoch": 4.116865869853918, "step": 4650}, {"loss": 1.247, "grad_norm": 0.8708247542381287, "learning_rate": 0.0002, "epoch": 4.12571934484285, "step": 4660}, {"loss": 1.3599, "grad_norm": 0.8612170219421387, "learning_rate": 0.0002, "epoch": 4.134572819831784, "step": 4670}, {"loss": 1.3418, "grad_norm": 0.7688325047492981, "learning_rate": 0.0002, "epoch": 4.143426294820717, "step": 4680}, {"loss": 1.4349, "grad_norm": 0.7606917023658752, "learning_rate": 0.0002, "epoch": 4.152279769809651, "step": 4690}, {"loss": 1.3521, "grad_norm": 0.8241282105445862, "learning_rate": 0.0002, "epoch": 4.161133244798584, "step": 4700}, {"loss": 1.3325, "grad_norm": 0.7480464577674866, "learning_rate": 0.0002, "epoch": 4.1699867197875164, "step": 4710}, {"loss": 1.4027, "grad_norm": 0.7092460989952087, "learning_rate": 0.0002, "epoch": 4.17884019477645, "step": 4720}, {"loss": 1.4005, "grad_norm": 0.8782108426094055, "learning_rate": 0.0002, "epoch": 4.187693669765383, "step": 4730}, {"loss": 1.3626, "grad_norm": 0.6875300407409668, "learning_rate": 0.0002, "epoch": 4.196547144754316, "step": 4740}, {"loss": 1.3798, "grad_norm": 0.7713887691497803, "learning_rate": 0.0002, "epoch": 4.20540061974325, "step": 4750}, {"loss": 1.3822, "grad_norm": 0.8270819783210754, "learning_rate": 0.0002, "epoch": 4.2142540947321825, "step": 4760}, {"loss": 1.3559, "grad_norm": 0.7109288573265076, "learning_rate": 0.0002, "epoch": 4.223107569721115, "step": 4770}, {"loss": 1.3948, "grad_norm": 0.7209359407424927, "learning_rate": 0.0002, "epoch": 4.231961044710049, "step": 4780}, {"loss": 1.3691, "grad_norm": 0.7142833471298218, "learning_rate": 0.0002, "epoch": 4.240814519698982, "step": 4790}, {"loss": 1.3654, "grad_norm": 0.8526809811592102, "learning_rate": 0.0002, "epoch": 4.249667994687915, "step": 4800}, {"loss": 1.3819, "grad_norm": 0.7064695954322815, "learning_rate": 0.0002, "epoch": 4.2585214696768485, "step": 4810}, {"loss": 1.3333, "grad_norm": 0.7646124362945557, "learning_rate": 0.0002, "epoch": 4.267374944665781, "step": 4820}, {"loss": 1.4247, "grad_norm": 0.7377115488052368, "learning_rate": 0.0002, "epoch": 4.276228419654714, "step": 4830}, {"loss": 1.3683, "grad_norm": 0.7308453321456909, "learning_rate": 0.0002, "epoch": 4.285081894643648, "step": 4840}, {"loss": 1.3653, "grad_norm": 0.6687684059143066, "learning_rate": 0.0002, "epoch": 4.293935369632581, "step": 4850}, {"loss": 1.3538, "grad_norm": 0.7447634339332581, "learning_rate": 0.0002, "epoch": 4.302788844621514, "step": 4860}, {"loss": 1.3842, "grad_norm": 0.7661601305007935, "learning_rate": 0.0002, "epoch": 4.311642319610447, "step": 4870}, {"loss": 1.3783, "grad_norm": 0.7492215037345886, "learning_rate": 0.0002, "epoch": 4.32049579459938, "step": 4880}, {"loss": 1.4089, "grad_norm": 0.9554458856582642, "learning_rate": 0.0002, "epoch": 4.329349269588313, "step": 4890}, {"loss": 1.3582, "grad_norm": 0.7409822940826416, "learning_rate": 0.0002, "epoch": 4.338202744577247, "step": 4900}, {"loss": 1.2581, "grad_norm": 0.9848645329475403, "learning_rate": 0.0002, "epoch": 4.34705621956618, "step": 4910}, {"loss": 1.3809, "grad_norm": 0.803995668888092, "learning_rate": 0.0002, "epoch": 4.355909694555113, "step": 4920}, {"loss": 1.3585, "grad_norm": 0.7480606436729431, "learning_rate": 0.0002, "epoch": 4.364763169544046, "step": 4930}, {"loss": 1.4092, "grad_norm": 0.7018141150474548, "learning_rate": 0.0002, "epoch": 4.373616644532979, "step": 4940}, {"loss": 1.4034, "grad_norm": 0.7684932351112366, "learning_rate": 0.0002, "epoch": 4.382470119521912, "step": 4950}, {"loss": 1.3937, "grad_norm": 0.7849185466766357, "learning_rate": 0.0002, "epoch": 4.391323594510846, "step": 4960}, {"loss": 1.3763, "grad_norm": 0.7858862280845642, "learning_rate": 0.0002, "epoch": 4.400177069499779, "step": 4970}, {"loss": 1.3901, "grad_norm": 0.8270778059959412, "learning_rate": 0.0002, "epoch": 4.4090305444887115, "step": 4980}, {"loss": 1.445, "grad_norm": 0.8464101552963257, "learning_rate": 0.0002, "epoch": 4.417884019477645, "step": 4990}, {"loss": 1.3586, "grad_norm": 0.85670405626297, "learning_rate": 0.0002, "epoch": 4.426737494466578, "step": 5000}, {"loss": 1.4203, "grad_norm": 0.8656655550003052, "learning_rate": 0.0002, "epoch": 4.435590969455511, "step": 5010}, {"loss": 1.3426, "grad_norm": 0.7605292201042175, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 5020}, {"loss": 1.3803, "grad_norm": 0.7682471871376038, "learning_rate": 0.0002, "epoch": 4.4532979194333775, "step": 5030}, {"loss": 1.3432, "grad_norm": 0.7209102511405945, "learning_rate": 0.0002, "epoch": 4.46215139442231, "step": 5040}, {"loss": 1.5126, "grad_norm": 0.8259989023208618, "learning_rate": 0.0002, "epoch": 4.471004869411244, "step": 5050}, {"loss": 1.3709, "grad_norm": 0.7342197895050049, "learning_rate": 0.0002, "epoch": 4.479858344400177, "step": 5060}, {"loss": 1.4196, "grad_norm": 0.7869040369987488, "learning_rate": 0.0002, "epoch": 4.48871181938911, "step": 5070}, {"loss": 1.3734, "grad_norm": 0.7906143665313721, "learning_rate": 0.0002, "epoch": 4.4975652943780435, "step": 5080}, {"loss": 1.3555, "grad_norm": 0.7336861491203308, "learning_rate": 0.0002, "epoch": 4.506418769366976, "step": 5090}, {"loss": 1.3768, "grad_norm": 0.8264166712760925, "learning_rate": 0.0002, "epoch": 4.515272244355909, "step": 5100}, {"loss": 1.3822, "grad_norm": 0.8144693970680237, "learning_rate": 0.0002, "epoch": 4.524125719344843, "step": 5110}, {"loss": 1.3044, "grad_norm": 0.8257269263267517, "learning_rate": 0.0002, "epoch": 4.532979194333776, "step": 5120}, {"loss": 1.3501, "grad_norm": 0.8838174343109131, "learning_rate": 0.0002, "epoch": 4.541832669322709, "step": 5130}, {"loss": 1.3464, "grad_norm": 0.7081145644187927, "learning_rate": 0.0002, "epoch": 4.550686144311642, "step": 5140}, {"loss": 1.342, "grad_norm": 0.7137823700904846, "learning_rate": 0.0002, "epoch": 4.559539619300575, "step": 5150}, {"loss": 1.3788, "grad_norm": 0.7890386581420898, "learning_rate": 0.0002, "epoch": 4.568393094289509, "step": 5160}, {"loss": 1.3368, "grad_norm": 0.6418015360832214, "learning_rate": 0.0002, "epoch": 4.577246569278442, "step": 5170}, {"loss": 1.3892, "grad_norm": 0.768373966217041, "learning_rate": 0.0002, "epoch": 4.586100044267375, "step": 5180}, {"loss": 1.3953, "grad_norm": 0.6934067606925964, "learning_rate": 0.0002, "epoch": 4.5949535192563085, "step": 5190}, {"loss": 1.3782, "grad_norm": 0.9430719017982483, "learning_rate": 0.0002, "epoch": 4.603806994245241, "step": 5200}, {"loss": 1.3981, "grad_norm": 0.880264163017273, "learning_rate": 0.0002, "epoch": 4.612660469234174, "step": 5210}, {"loss": 1.3506, "grad_norm": 0.7584623098373413, "learning_rate": 0.0002, "epoch": 4.621513944223108, "step": 5220}, {"loss": 1.3973, "grad_norm": 0.7974506616592407, "learning_rate": 0.0002, "epoch": 4.630367419212041, "step": 5230}, {"loss": 1.3818, "grad_norm": 0.8812133073806763, "learning_rate": 0.0002, "epoch": 4.639220894200974, "step": 5240}, {"loss": 1.4002, "grad_norm": 0.8968724012374878, "learning_rate": 0.0002, "epoch": 4.648074369189907, "step": 5250}, {"loss": 1.3327, "grad_norm": 0.7317764759063721, "learning_rate": 0.0002, "epoch": 4.65692784417884, "step": 5260}, {"loss": 1.4363, "grad_norm": 0.7415484189987183, "learning_rate": 0.0002, "epoch": 4.665781319167773, "step": 5270}, {"loss": 1.3673, "grad_norm": 0.7867009043693542, "learning_rate": 0.0002, "epoch": 4.674634794156707, "step": 5280}, {"loss": 1.4246, "grad_norm": 0.6895416378974915, "learning_rate": 0.0002, "epoch": 4.68348826914564, "step": 5290}, {"loss": 1.3438, "grad_norm": 0.7324506640434265, "learning_rate": 0.0002, "epoch": 4.6923417441345725, "step": 5300}, {"loss": 1.4072, "grad_norm": 0.7383193969726562, "learning_rate": 0.0002, "epoch": 4.701195219123506, "step": 5310}, {"loss": 1.3269, "grad_norm": 0.8254916071891785, "learning_rate": 0.0002, "epoch": 4.710048694112439, "step": 5320}, {"loss": 1.4317, "grad_norm": 0.8161033987998962, "learning_rate": 0.0002, "epoch": 4.718902169101372, "step": 5330}, {"loss": 1.3623, "grad_norm": 0.7664386034011841, "learning_rate": 0.0002, "epoch": 4.727755644090306, "step": 5340}, {"loss": 1.4293, "grad_norm": 0.7465475797653198, "learning_rate": 0.0002, "epoch": 4.7366091190792385, "step": 5350}, {"loss": 1.3435, "grad_norm": 0.7810078263282776, "learning_rate": 0.0002, "epoch": 4.745462594068171, "step": 5360}, {"loss": 1.4489, "grad_norm": 0.7428439855575562, "learning_rate": 0.0002, "epoch": 4.754316069057105, "step": 5370}, {"loss": 1.3607, "grad_norm": 0.9548320174217224, "learning_rate": 0.0002, "epoch": 4.763169544046038, "step": 5380}, {"loss": 1.3398, "grad_norm": 0.7959533333778381, "learning_rate": 0.0002, "epoch": 4.772023019034972, "step": 5390}, {"loss": 1.3448, "grad_norm": 0.747473418712616, "learning_rate": 0.0002, "epoch": 4.780876494023905, "step": 5400}, {"loss": 1.3954, "grad_norm": 0.7863122820854187, "learning_rate": 0.0002, "epoch": 4.789729969012837, "step": 5410}, {"loss": 1.4166, "grad_norm": 0.7769626379013062, "learning_rate": 0.0002, "epoch": 4.798583444001771, "step": 5420}, {"loss": 1.4484, "grad_norm": 0.8551191091537476, "learning_rate": 0.0002, "epoch": 4.807436918990704, "step": 5430}, {"loss": 1.4314, "grad_norm": 0.8364850878715515, "learning_rate": 0.0002, "epoch": 4.816290393979637, "step": 5440}, {"loss": 1.4028, "grad_norm": 0.7458856701850891, "learning_rate": 0.0002, "epoch": 4.825143868968571, "step": 5450}, {"loss": 1.3923, "grad_norm": 0.7558291554450989, "learning_rate": 0.0002, "epoch": 4.8339973439575035, "step": 5460}, {"loss": 1.3343, "grad_norm": 0.8396534323692322, "learning_rate": 0.0002, "epoch": 4.842850818946436, "step": 5470}, {"loss": 1.3853, "grad_norm": 0.7790794968605042, "learning_rate": 0.0002, "epoch": 4.85170429393537, "step": 5480}, {"loss": 1.406, "grad_norm": 0.8607641458511353, "learning_rate": 0.0002, "epoch": 4.860557768924303, "step": 5490}, {"loss": 1.4011, "grad_norm": 0.828134298324585, "learning_rate": 0.0002, "epoch": 4.869411243913236, "step": 5500}, {"loss": 1.4089, "grad_norm": 0.8783106803894043, "learning_rate": 0.0002, "epoch": 4.8782647189021695, "step": 5510}, {"loss": 1.4565, "grad_norm": 0.7476183176040649, "learning_rate": 0.0002, "epoch": 4.887118193891102, "step": 5520}, {"loss": 1.3974, "grad_norm": 0.8023254871368408, "learning_rate": 0.0002, "epoch": 4.895971668880035, "step": 5530}, {"loss": 1.2979, "grad_norm": 0.8021706938743591, "learning_rate": 0.0002, "epoch": 4.904825143868969, "step": 5540}, {"loss": 1.4139, "grad_norm": 0.7873618602752686, "learning_rate": 0.0002, "epoch": 4.913678618857902, "step": 5550}, {"loss": 1.4393, "grad_norm": 0.7181428670883179, "learning_rate": 0.0002, "epoch": 4.922532093846835, "step": 5560}, {"loss": 1.3968, "grad_norm": 0.7464273571968079, "learning_rate": 0.0002, "epoch": 4.931385568835768, "step": 5570}, {"loss": 1.3184, "grad_norm": 0.7433671355247498, "learning_rate": 0.0002, "epoch": 4.940239043824701, "step": 5580}, {"loss": 1.4174, "grad_norm": 0.7571114301681519, "learning_rate": 0.0002, "epoch": 4.949092518813634, "step": 5590}, {"loss": 1.4418, "grad_norm": 0.7811630964279175, "learning_rate": 0.0002, "epoch": 4.957945993802568, "step": 5600}, {"loss": 1.4288, "grad_norm": 0.7609148621559143, "learning_rate": 0.0002, "epoch": 4.966799468791501, "step": 5610}, {"loss": 1.3786, "grad_norm": 0.7324382066726685, "learning_rate": 0.0002, "epoch": 4.9756529437804335, "step": 5620}, {"loss": 1.4557, "grad_norm": 0.9249559640884399, "learning_rate": 0.0002, "epoch": 4.984506418769367, "step": 5630}, {"loss": 1.4064, "grad_norm": 0.7852522134780884, "learning_rate": 0.0002, "epoch": 4.9933598937583, "step": 5640}]} +{"epoch": 6.0, "step": 6777, "epoch_duration": 2961.419434785843, "total_accumulated_duration": 17689.444549798965, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}, {"eval_loss": 1.8077166080474854, "eval_runtime": 82.8351, "eval_samples_per_second": 6.217, "eval_steps_per_second": 0.785, "epoch": 2.0, "step": 2259}, {"loss": 1.7081, "grad_norm": 0.34340235590934753, "learning_rate": 0.0002, "epoch": 2.0008853474988935, "step": 2260}, {"loss": 1.6815, "grad_norm": 0.3710762858390808, "learning_rate": 0.0002, "epoch": 2.0097388224878263, "step": 2270}, {"loss": 1.5828, "grad_norm": 0.35640114545822144, "learning_rate": 0.0002, "epoch": 2.0185922974767596, "step": 2280}, {"loss": 1.6322, "grad_norm": 0.45970189571380615, "learning_rate": 0.0002, "epoch": 2.027445772465693, "step": 2290}, {"loss": 1.5598, "grad_norm": 0.4256797134876251, "learning_rate": 0.0002, "epoch": 2.0362992474546258, "step": 2300}, {"loss": 1.6271, "grad_norm": 0.42421531677246094, "learning_rate": 0.0002, "epoch": 2.045152722443559, "step": 2310}, {"loss": 1.6117, "grad_norm": 0.4032478928565979, "learning_rate": 0.0002, "epoch": 2.0540061974324924, "step": 2320}, {"loss": 1.6389, "grad_norm": 0.4073623716831207, "learning_rate": 0.0002, "epoch": 2.062859672421425, "step": 2330}, {"loss": 1.6527, "grad_norm": 0.4845200777053833, "learning_rate": 0.0002, "epoch": 2.0717131474103585, "step": 2340}, {"loss": 1.5734, "grad_norm": 0.40578293800354004, "learning_rate": 0.0002, "epoch": 2.080566622399292, "step": 2350}, {"loss": 1.5853, "grad_norm": 0.4037284255027771, "learning_rate": 0.0002, "epoch": 2.089420097388225, "step": 2360}, {"loss": 1.6511, "grad_norm": 0.4717613160610199, "learning_rate": 0.0002, "epoch": 2.098273572377158, "step": 2370}, {"loss": 1.6273, "grad_norm": 0.42076411843299866, "learning_rate": 0.0002, "epoch": 2.1071270473660912, "step": 2380}, {"loss": 1.654, "grad_norm": 0.47799113392829895, "learning_rate": 0.0002, "epoch": 2.1159805223550245, "step": 2390}, {"loss": 1.5528, "grad_norm": 0.4253084063529968, "learning_rate": 0.0002, "epoch": 2.1248339973439574, "step": 2400}, {"loss": 1.6432, "grad_norm": 0.5023085474967957, "learning_rate": 0.0002, "epoch": 2.1336874723328907, "step": 2410}, {"loss": 1.5926, "grad_norm": 0.49162712693214417, "learning_rate": 0.0002, "epoch": 2.142540947321824, "step": 2420}, {"loss": 1.5779, "grad_norm": 0.39035019278526306, "learning_rate": 0.0002, "epoch": 2.151394422310757, "step": 2430}, {"loss": 1.7526, "grad_norm": 0.43223854899406433, "learning_rate": 0.0002, "epoch": 2.16024789729969, "step": 2440}, {"loss": 1.6334, "grad_norm": 0.4596616327762604, "learning_rate": 0.0002, "epoch": 2.1691013722886234, "step": 2450}, {"loss": 1.6067, "grad_norm": 0.4469447731971741, "learning_rate": 0.0002, "epoch": 2.1779548472775563, "step": 2460}, {"loss": 1.5806, "grad_norm": 0.5100595355033875, "learning_rate": 0.0002, "epoch": 2.1868083222664896, "step": 2470}, {"loss": 1.6456, "grad_norm": 0.4169430732727051, "learning_rate": 0.0002, "epoch": 2.195661797255423, "step": 2480}, {"loss": 1.6734, "grad_norm": 0.4699254035949707, "learning_rate": 0.0002, "epoch": 2.2045152722443557, "step": 2490}, {"loss": 1.6259, "grad_norm": 0.43524250388145447, "learning_rate": 0.0002, "epoch": 2.213368747233289, "step": 2500}, {"loss": 1.6717, "grad_norm": 0.4496648907661438, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2510}, {"loss": 1.6735, "grad_norm": 0.43408212065696716, "learning_rate": 0.0002, "epoch": 2.231075697211155, "step": 2520}, {"loss": 1.611, "grad_norm": 0.4596034288406372, "learning_rate": 0.0002, "epoch": 2.2399291722000885, "step": 2530}, {"loss": 1.6271, "grad_norm": 0.5217021107673645, "learning_rate": 0.0002, "epoch": 2.2487826471890218, "step": 2540}, {"loss": 1.6027, "grad_norm": 0.44745638966560364, "learning_rate": 0.0002, "epoch": 2.2576361221779546, "step": 2550}, {"loss": 1.675, "grad_norm": 0.4484798014163971, "learning_rate": 0.0002, "epoch": 2.266489597166888, "step": 2560}, {"loss": 1.5321, "grad_norm": 0.4428067207336426, "learning_rate": 0.0002, "epoch": 2.275343072155821, "step": 2570}, {"loss": 1.6716, "grad_norm": 0.5095171332359314, "learning_rate": 0.0002, "epoch": 2.2841965471447545, "step": 2580}, {"loss": 1.5661, "grad_norm": 0.44833096861839294, "learning_rate": 0.0002, "epoch": 2.2930500221336874, "step": 2590}, {"loss": 1.652, "grad_norm": 0.507905900478363, "learning_rate": 0.0002, "epoch": 2.3019034971226207, "step": 2600}, {"loss": 1.5963, "grad_norm": 0.40808171033859253, "learning_rate": 0.0002, "epoch": 2.310756972111554, "step": 2610}, {"loss": 1.6574, "grad_norm": 0.4684814214706421, "learning_rate": 0.0002, "epoch": 2.319610447100487, "step": 2620}, {"loss": 1.587, "grad_norm": 0.44864922761917114, "learning_rate": 0.0002, "epoch": 2.32846392208942, "step": 2630}, {"loss": 1.5828, "grad_norm": 0.4174162745475769, "learning_rate": 0.0002, "epoch": 2.3373173970783534, "step": 2640}, {"loss": 1.642, "grad_norm": 0.42314743995666504, "learning_rate": 0.0002, "epoch": 2.3461708720672863, "step": 2650}, {"loss": 1.5884, "grad_norm": 0.49224185943603516, "learning_rate": 0.0002, "epoch": 2.3550243470562195, "step": 2660}, {"loss": 1.5766, "grad_norm": 0.45190292596817017, "learning_rate": 0.0002, "epoch": 2.363877822045153, "step": 2670}, {"loss": 1.6284, "grad_norm": 0.41817107796669006, "learning_rate": 0.0002, "epoch": 2.3727312970340857, "step": 2680}, {"loss": 1.6356, "grad_norm": 0.6436763405799866, "learning_rate": 0.0002, "epoch": 2.381584772023019, "step": 2690}, {"loss": 1.5915, "grad_norm": 0.47175949811935425, "learning_rate": 0.0002, "epoch": 2.3904382470119523, "step": 2700}, {"loss": 1.6303, "grad_norm": 0.480339378118515, "learning_rate": 0.0002, "epoch": 2.3992917220008856, "step": 2710}, {"loss": 1.5697, "grad_norm": 0.4723486006259918, "learning_rate": 0.0002, "epoch": 2.4081451969898184, "step": 2720}, {"loss": 1.54, "grad_norm": 0.4305492043495178, "learning_rate": 0.0002, "epoch": 2.4169986719787517, "step": 2730}, {"loss": 1.71, "grad_norm": 0.5007492303848267, "learning_rate": 0.0002, "epoch": 2.425852146967685, "step": 2740}, {"loss": 1.5369, "grad_norm": 0.5374062061309814, "learning_rate": 0.0002, "epoch": 2.434705621956618, "step": 2750}, {"loss": 1.6156, "grad_norm": 0.45866212248802185, "learning_rate": 0.0002, "epoch": 2.443559096945551, "step": 2760}, {"loss": 1.6066, "grad_norm": 0.47914502024650574, "learning_rate": 0.0002, "epoch": 2.4524125719344845, "step": 2770}, {"loss": 1.5644, "grad_norm": 0.43804746866226196, "learning_rate": 0.0002, "epoch": 2.4612660469234173, "step": 2780}, {"loss": 1.5952, "grad_norm": 0.43656906485557556, "learning_rate": 0.0002, "epoch": 2.4701195219123506, "step": 2790}, {"loss": 1.6311, "grad_norm": 0.4820363521575928, "learning_rate": 0.0002, "epoch": 2.478972996901284, "step": 2800}, {"loss": 1.5375, "grad_norm": 0.4916800558567047, "learning_rate": 0.0002, "epoch": 2.4878264718902168, "step": 2810}, {"loss": 1.5736, "grad_norm": 0.4521256983280182, "learning_rate": 0.0002, "epoch": 2.49667994687915, "step": 2820}, {"loss": 1.6179, "grad_norm": 0.5066806674003601, "learning_rate": 0.0002, "epoch": 2.5055334218680834, "step": 2830}, {"loss": 1.5812, "grad_norm": 0.4768151640892029, "learning_rate": 0.0002, "epoch": 2.514386896857016, "step": 2840}, {"loss": 1.6719, "grad_norm": 0.5144683718681335, "learning_rate": 0.0002, "epoch": 2.5232403718459495, "step": 2850}, {"loss": 1.6063, "grad_norm": 0.4718942940235138, "learning_rate": 0.0002, "epoch": 2.532093846834883, "step": 2860}, {"loss": 1.6099, "grad_norm": 0.4924587309360504, "learning_rate": 0.0002, "epoch": 2.5409473218238157, "step": 2870}, {"loss": 1.5994, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "epoch": 2.549800796812749, "step": 2880}, {"loss": 1.6501, "grad_norm": 0.4836665987968445, "learning_rate": 0.0002, "epoch": 2.5586542718016823, "step": 2890}, {"loss": 1.6518, "grad_norm": 0.4162124991416931, "learning_rate": 0.0002, "epoch": 2.567507746790615, "step": 2900}, {"loss": 1.6471, "grad_norm": 0.4894537925720215, "learning_rate": 0.0002, "epoch": 2.5763612217795484, "step": 2910}, {"loss": 1.6123, "grad_norm": 0.4539397358894348, "learning_rate": 0.0002, "epoch": 2.5852146967684817, "step": 2920}, {"loss": 1.6449, "grad_norm": 0.4718773066997528, "learning_rate": 0.0002, "epoch": 2.5940681717574146, "step": 2930}, {"loss": 1.584, "grad_norm": 0.49989837408065796, "learning_rate": 0.0002, "epoch": 2.602921646746348, "step": 2940}, {"loss": 1.6087, "grad_norm": 0.4862406849861145, "learning_rate": 0.0002, "epoch": 2.611775121735281, "step": 2950}, {"loss": 1.6057, "grad_norm": 0.4244804382324219, "learning_rate": 0.0002, "epoch": 2.620628596724214, "step": 2960}, {"loss": 1.7795, "grad_norm": 0.49304354190826416, "learning_rate": 0.0002, "epoch": 2.6294820717131473, "step": 2970}, {"loss": 1.7255, "grad_norm": 0.4818236529827118, "learning_rate": 0.0002, "epoch": 2.6383355467020806, "step": 2980}, {"loss": 1.621, "grad_norm": 0.5077425837516785, "learning_rate": 0.0002, "epoch": 2.647189021691014, "step": 2990}, {"loss": 1.7064, "grad_norm": 0.4494157135486603, "learning_rate": 0.0002, "epoch": 2.6560424966799467, "step": 3000}, {"loss": 1.6792, "grad_norm": 0.4790278971195221, "learning_rate": 0.0002, "epoch": 2.66489597166888, "step": 3010}, {"loss": 1.6082, "grad_norm": 0.4702624976634979, "learning_rate": 0.0002, "epoch": 2.6737494466578133, "step": 3020}, {"loss": 1.6494, "grad_norm": 0.5082133412361145, "learning_rate": 0.0002, "epoch": 2.682602921646746, "step": 3030}, {"loss": 1.6438, "grad_norm": 0.4553256630897522, "learning_rate": 0.0002, "epoch": 2.6914563966356795, "step": 3040}, {"loss": 1.6155, "grad_norm": 0.4492715001106262, "learning_rate": 0.0002, "epoch": 2.700309871624613, "step": 3050}, {"loss": 1.5367, "grad_norm": 0.4555944502353668, "learning_rate": 0.0002, "epoch": 2.709163346613546, "step": 3060}, {"loss": 1.5793, "grad_norm": 0.5879693031311035, "learning_rate": 0.0002, "epoch": 2.718016821602479, "step": 3070}, {"loss": 1.6357, "grad_norm": 0.4628562927246094, "learning_rate": 0.0002, "epoch": 2.7268702965914122, "step": 3080}, {"loss": 1.6585, "grad_norm": 0.5169575810432434, "learning_rate": 0.0002, "epoch": 2.7357237715803455, "step": 3090}, {"loss": 1.562, "grad_norm": 0.4630090892314911, "learning_rate": 0.0002, "epoch": 2.7445772465692784, "step": 3100}, {"loss": 1.5508, "grad_norm": 0.5437219738960266, "learning_rate": 0.0002, "epoch": 2.7534307215582117, "step": 3110}, {"loss": 1.6442, "grad_norm": 0.5102152228355408, "learning_rate": 0.0002, "epoch": 2.762284196547145, "step": 3120}, {"loss": 1.5448, "grad_norm": 0.48287826776504517, "learning_rate": 0.0002, "epoch": 2.771137671536078, "step": 3130}, {"loss": 1.6657, "grad_norm": 0.4671737253665924, "learning_rate": 0.0002, "epoch": 2.779991146525011, "step": 3140}, {"loss": 1.5864, "grad_norm": 0.5177035331726074, "learning_rate": 0.0002, "epoch": 2.7888446215139444, "step": 3150}, {"loss": 1.5617, "grad_norm": 0.450989305973053, "learning_rate": 0.0002, "epoch": 2.7976980965028773, "step": 3160}, {"loss": 1.597, "grad_norm": 0.45007848739624023, "learning_rate": 0.0002, "epoch": 2.8065515714918106, "step": 3170}, {"loss": 1.7179, "grad_norm": 0.4600294530391693, "learning_rate": 0.0002, "epoch": 2.815405046480744, "step": 3180}, {"loss": 1.6441, "grad_norm": 0.485628604888916, "learning_rate": 0.0002, "epoch": 2.8242585214696767, "step": 3190}, {"loss": 1.6396, "grad_norm": 0.49811574816703796, "learning_rate": 0.0002, "epoch": 2.83311199645861, "step": 3200}, {"loss": 1.6067, "grad_norm": 0.5012516975402832, "learning_rate": 0.0002, "epoch": 2.8419654714475433, "step": 3210}, {"loss": 1.6188, "grad_norm": 0.4552757740020752, "learning_rate": 0.0002, "epoch": 2.850818946436476, "step": 3220}, {"loss": 1.5993, "grad_norm": 0.4539635479450226, "learning_rate": 0.0002, "epoch": 2.8596724214254094, "step": 3230}, {"loss": 1.5957, "grad_norm": 0.5534685850143433, "learning_rate": 0.0002, "epoch": 2.8685258964143427, "step": 3240}, {"loss": 1.6065, "grad_norm": 0.4570811688899994, "learning_rate": 0.0002, "epoch": 2.8773793714032756, "step": 3250}, {"loss": 1.6016, "grad_norm": 0.48181653022766113, "learning_rate": 0.0002, "epoch": 2.886232846392209, "step": 3260}, {"loss": 1.6574, "grad_norm": 0.4871032238006592, "learning_rate": 0.0002, "epoch": 2.895086321381142, "step": 3270}, {"loss": 1.5626, "grad_norm": 0.4643239676952362, "learning_rate": 0.0002, "epoch": 2.903939796370075, "step": 3280}, {"loss": 1.5981, "grad_norm": 0.5024484395980835, "learning_rate": 0.0002, "epoch": 2.9127932713590083, "step": 3290}, {"loss": 1.5756, "grad_norm": 0.4425384998321533, "learning_rate": 0.0002, "epoch": 2.9216467463479416, "step": 3300}, {"loss": 1.644, "grad_norm": 0.459168016910553, "learning_rate": 0.0002, "epoch": 2.9305002213368745, "step": 3310}, {"loss": 1.6404, "grad_norm": 0.4950717091560364, "learning_rate": 0.0002, "epoch": 2.939353696325808, "step": 3320}, {"loss": 1.652, "grad_norm": 0.4516230523586273, "learning_rate": 0.0002, "epoch": 2.948207171314741, "step": 3330}, {"loss": 1.5917, "grad_norm": 0.49523285031318665, "learning_rate": 0.0002, "epoch": 2.957060646303674, "step": 3340}, {"loss": 1.733, "grad_norm": 0.49282631278038025, "learning_rate": 0.0002, "epoch": 2.9659141212926072, "step": 3350}, {"loss": 1.6519, "grad_norm": 0.45825016498565674, "learning_rate": 0.0002, "epoch": 2.9747675962815405, "step": 3360}, {"loss": 1.6607, "grad_norm": 0.4952891170978546, "learning_rate": 0.0002, "epoch": 2.983621071270474, "step": 3370}, {"loss": 1.5981, "grad_norm": 0.42182639241218567, "learning_rate": 0.0002, "epoch": 2.9924745462594067, "step": 3380}, {"eval_loss": 1.8308420181274414, "eval_runtime": 82.786, "eval_samples_per_second": 6.221, "eval_steps_per_second": 0.785, "epoch": 2.9995573262505535, "step": 3388}, {"loss": 1.5811, "grad_norm": 0.47721418738365173, "learning_rate": 0.0002, "epoch": 3.00132802124834, "step": 3390}, {"loss": 1.5137, "grad_norm": 0.5284923911094666, "learning_rate": 0.0002, "epoch": 3.0101814962372733, "step": 3400}, {"loss": 1.437, "grad_norm": 0.5607061982154846, "learning_rate": 0.0002, "epoch": 3.019034971226206, "step": 3410}, {"loss": 1.4909, "grad_norm": 0.5271363258361816, "learning_rate": 0.0002, "epoch": 3.0278884462151394, "step": 3420}, {"loss": 1.5645, "grad_norm": 0.48660898208618164, "learning_rate": 0.0002, "epoch": 3.0367419212040727, "step": 3430}, {"loss": 1.4754, "grad_norm": 0.5767933130264282, "learning_rate": 0.0002, "epoch": 3.0455953961930056, "step": 3440}, {"loss": 1.4647, "grad_norm": 0.5591282248497009, "learning_rate": 0.0002, "epoch": 3.054448871181939, "step": 3450}, {"loss": 1.5112, "grad_norm": 0.5870814323425293, "learning_rate": 0.0002, "epoch": 3.063302346170872, "step": 3460}, {"loss": 1.4682, "grad_norm": 0.4861546456813812, "learning_rate": 0.0002, "epoch": 3.072155821159805, "step": 3470}, {"loss": 1.4883, "grad_norm": 0.5238925814628601, "learning_rate": 0.0002, "epoch": 3.0810092961487383, "step": 3480}, {"loss": 1.4855, "grad_norm": 0.5521751046180725, "learning_rate": 0.0002, "epoch": 3.0898627711376716, "step": 3490}, {"loss": 1.4454, "grad_norm": 0.5816575884819031, "learning_rate": 0.0002, "epoch": 3.098716246126605, "step": 3500}, {"loss": 1.5113, "grad_norm": 0.5281513333320618, "learning_rate": 0.0002, "epoch": 3.1075697211155378, "step": 3510}, {"loss": 1.4723, "grad_norm": 0.5847303867340088, "learning_rate": 0.0002, "epoch": 3.116423196104471, "step": 3520}, {"loss": 1.5513, "grad_norm": 0.5683517456054688, "learning_rate": 0.0002, "epoch": 3.1252766710934043, "step": 3530}, {"loss": 1.532, "grad_norm": 0.5177015662193298, "learning_rate": 0.0002, "epoch": 3.134130146082337, "step": 3540}, {"loss": 1.4921, "grad_norm": 0.5922423601150513, "learning_rate": 0.0002, "epoch": 3.1429836210712705, "step": 3550}, {"loss": 1.5329, "grad_norm": 0.7018587589263916, "learning_rate": 0.0002, "epoch": 3.151837096060204, "step": 3560}, {"loss": 1.4677, "grad_norm": 0.6152004599571228, "learning_rate": 0.0002, "epoch": 3.1606905710491366, "step": 3570}, {"loss": 1.4288, "grad_norm": 0.5350717902183533, "learning_rate": 0.0002, "epoch": 3.16954404603807, "step": 3580}, {"loss": 1.4739, "grad_norm": 0.5971009731292725, "learning_rate": 0.0002, "epoch": 3.1783975210270032, "step": 3590}, {"loss": 1.541, "grad_norm": 0.7312001585960388, "learning_rate": 0.0002, "epoch": 3.187250996015936, "step": 3600}, {"loss": 1.5803, "grad_norm": 0.6372535228729248, "learning_rate": 0.0002, "epoch": 3.1961044710048694, "step": 3610}, {"loss": 1.4642, "grad_norm": 0.6098020672798157, "learning_rate": 0.0002, "epoch": 3.2049579459938027, "step": 3620}, {"loss": 1.5149, "grad_norm": 0.5506435632705688, "learning_rate": 0.0002, "epoch": 3.2138114209827355, "step": 3630}, {"loss": 1.4338, "grad_norm": 0.6043022274971008, "learning_rate": 0.0002, "epoch": 3.222664895971669, "step": 3640}, {"loss": 1.5351, "grad_norm": 0.5495519042015076, "learning_rate": 0.0002, "epoch": 3.231518370960602, "step": 3650}, {"loss": 1.3879, "grad_norm": 0.5769572257995605, "learning_rate": 0.0002, "epoch": 3.240371845949535, "step": 3660}, {"loss": 1.4604, "grad_norm": 0.6833786964416504, "learning_rate": 0.0002, "epoch": 3.2492253209384683, "step": 3670}, {"loss": 1.5091, "grad_norm": 0.6962856650352478, "learning_rate": 0.0002, "epoch": 3.2580787959274016, "step": 3680}, {"loss": 1.5212, "grad_norm": 0.6553098559379578, "learning_rate": 0.0002, "epoch": 3.2669322709163344, "step": 3690}, {"loss": 1.5416, "grad_norm": 0.5907557010650635, "learning_rate": 0.0002, "epoch": 3.2757857459052677, "step": 3700}, {"loss": 1.5012, "grad_norm": 0.5712862014770508, "learning_rate": 0.0002, "epoch": 3.284639220894201, "step": 3710}, {"loss": 1.5073, "grad_norm": 0.573820948600769, "learning_rate": 0.0002, "epoch": 3.2934926958831343, "step": 3720}, {"loss": 1.544, "grad_norm": 0.6650304198265076, "learning_rate": 0.0002, "epoch": 3.302346170872067, "step": 3730}, {"loss": 1.5069, "grad_norm": 0.5182583928108215, "learning_rate": 0.0002, "epoch": 3.3111996458610005, "step": 3740}, {"loss": 1.5254, "grad_norm": 0.5078902840614319, "learning_rate": 0.0002, "epoch": 3.3200531208499338, "step": 3750}, {"loss": 1.4881, "grad_norm": 0.7062374353408813, "learning_rate": 0.0002, "epoch": 3.3289065958388666, "step": 3760}, {"loss": 1.5017, "grad_norm": 0.5711262822151184, "learning_rate": 0.0002, "epoch": 3.3377600708278, "step": 3770}, {"loss": 1.4982, "grad_norm": 0.5624606013298035, "learning_rate": 0.0002, "epoch": 3.346613545816733, "step": 3780}, {"loss": 1.4515, "grad_norm": 0.6008231043815613, "learning_rate": 0.0002, "epoch": 3.355467020805666, "step": 3790}, {"loss": 1.5038, "grad_norm": 0.6120018362998962, "learning_rate": 0.0002, "epoch": 3.3643204957945994, "step": 3800}, {"loss": 1.4918, "grad_norm": 0.5679979920387268, "learning_rate": 0.0002, "epoch": 3.3731739707835326, "step": 3810}, {"loss": 1.5435, "grad_norm": 0.5613794922828674, "learning_rate": 0.0002, "epoch": 3.3820274457724655, "step": 3820}, {"loss": 1.5319, "grad_norm": 0.5328839421272278, "learning_rate": 0.0002, "epoch": 3.390880920761399, "step": 3830}, {"loss": 1.5262, "grad_norm": 0.5960017442703247, "learning_rate": 0.0002, "epoch": 3.399734395750332, "step": 3840}, {"loss": 1.4227, "grad_norm": 0.5264106392860413, "learning_rate": 0.0002, "epoch": 3.4085878707392654, "step": 3850}, {"loss": 1.4766, "grad_norm": 0.6378359198570251, "learning_rate": 0.0002, "epoch": 3.4174413457281982, "step": 3860}, {"loss": 1.4898, "grad_norm": 0.5792967677116394, "learning_rate": 0.0002, "epoch": 3.4262948207171315, "step": 3870}, {"loss": 1.4914, "grad_norm": 0.6836280822753906, "learning_rate": 0.0002, "epoch": 3.435148295706065, "step": 3880}, {"loss": 1.5002, "grad_norm": 0.6073971390724182, "learning_rate": 0.0002, "epoch": 3.4440017706949977, "step": 3890}, {"loss": 1.4473, "grad_norm": 0.5753195881843567, "learning_rate": 0.0002, "epoch": 3.452855245683931, "step": 3900}, {"loss": 1.5332, "grad_norm": 0.6007646918296814, "learning_rate": 0.0002, "epoch": 3.4617087206728643, "step": 3910}, {"loss": 1.515, "grad_norm": 0.6025636196136475, "learning_rate": 0.0002, "epoch": 3.470562195661797, "step": 3920}, {"loss": 1.4612, "grad_norm": 0.6819562315940857, "learning_rate": 0.0002, "epoch": 3.4794156706507304, "step": 3930}, {"loss": 1.518, "grad_norm": 0.6448395848274231, "learning_rate": 0.0002, "epoch": 3.4882691456396637, "step": 3940}, {"loss": 1.5194, "grad_norm": 0.5712178945541382, "learning_rate": 0.0002, "epoch": 3.4971226206285966, "step": 3950}, {"loss": 1.4757, "grad_norm": 0.6300532817840576, "learning_rate": 0.0002, "epoch": 3.50597609561753, "step": 3960}, {"loss": 1.5142, "grad_norm": 0.6120840907096863, "learning_rate": 0.0002, "epoch": 3.514829570606463, "step": 3970}, {"loss": 1.559, "grad_norm": 0.6887575387954712, "learning_rate": 0.0002, "epoch": 3.523683045595396, "step": 3980}, {"loss": 1.5591, "grad_norm": 0.6970235109329224, "learning_rate": 0.0002, "epoch": 3.5325365205843293, "step": 3990}, {"loss": 1.5198, "grad_norm": 0.5818213820457458, "learning_rate": 0.0002, "epoch": 3.5413899955732626, "step": 4000}, {"loss": 1.5367, "grad_norm": 1.0533310174942017, "learning_rate": 0.0002, "epoch": 3.5502434705621955, "step": 4010}, {"loss": 1.5399, "grad_norm": 0.5444280505180359, "learning_rate": 0.0002, "epoch": 3.5590969455511288, "step": 4020}, {"loss": 1.5573, "grad_norm": 0.6007506847381592, "learning_rate": 0.0002, "epoch": 3.567950420540062, "step": 4030}, {"loss": 1.5059, "grad_norm": 0.6088743805885315, "learning_rate": 0.0002, "epoch": 3.576803895528995, "step": 4040}, {"loss": 1.5174, "grad_norm": 0.5934239029884338, "learning_rate": 0.0002, "epoch": 3.585657370517928, "step": 4050}, {"loss": 1.4938, "grad_norm": 0.605251669883728, "learning_rate": 0.0002, "epoch": 3.5945108455068615, "step": 4060}, {"loss": 1.5142, "grad_norm": 0.5903469920158386, "learning_rate": 0.0002, "epoch": 3.6033643204957944, "step": 4070}, {"loss": 1.5234, "grad_norm": 0.6752413511276245, "learning_rate": 0.0002, "epoch": 3.6122177954847277, "step": 4080}, {"loss": 1.5041, "grad_norm": 0.5810418725013733, "learning_rate": 0.0002, "epoch": 3.621071270473661, "step": 4090}, {"loss": 1.5358, "grad_norm": 0.5918573141098022, "learning_rate": 0.0002, "epoch": 3.629924745462594, "step": 4100}, {"loss": 1.499, "grad_norm": 0.6635358333587646, "learning_rate": 0.0002, "epoch": 3.638778220451527, "step": 4110}, {"loss": 1.5021, "grad_norm": 0.5785038471221924, "learning_rate": 0.0002, "epoch": 3.6476316954404604, "step": 4120}, {"loss": 1.5711, "grad_norm": 0.5837879776954651, "learning_rate": 0.0002, "epoch": 3.6564851704293937, "step": 4130}, {"loss": 1.4273, "grad_norm": 0.6449324488639832, "learning_rate": 0.0002, "epoch": 3.6653386454183265, "step": 4140}, {"loss": 1.4608, "grad_norm": 0.6191908717155457, "learning_rate": 0.0002, "epoch": 3.67419212040726, "step": 4150}, {"loss": 1.4567, "grad_norm": 0.6937987208366394, "learning_rate": 0.0002, "epoch": 3.683045595396193, "step": 4160}, {"loss": 1.4136, "grad_norm": 0.581128716468811, "learning_rate": 0.0002, "epoch": 3.6918990703851264, "step": 4170}, {"loss": 1.4204, "grad_norm": 0.6547803282737732, "learning_rate": 0.0002, "epoch": 3.7007525453740593, "step": 4180}, {"loss": 1.4653, "grad_norm": 0.5961150527000427, "learning_rate": 0.0002, "epoch": 3.7096060203629926, "step": 4190}, {"loss": 1.4755, "grad_norm": 0.6197913885116577, "learning_rate": 0.0002, "epoch": 3.718459495351926, "step": 4200}, {"loss": 1.5191, "grad_norm": 0.688565194606781, "learning_rate": 0.0002, "epoch": 3.7273129703408587, "step": 4210}, {"loss": 1.5618, "grad_norm": 0.5832270979881287, "learning_rate": 0.0002, "epoch": 3.736166445329792, "step": 4220}, {"loss": 1.4747, "grad_norm": 0.5643884539604187, "learning_rate": 0.0002, "epoch": 3.7450199203187253, "step": 4230}, {"loss": 1.5242, "grad_norm": 0.6236484050750732, "learning_rate": 0.0002, "epoch": 3.753873395307658, "step": 4240}, {"loss": 1.576, "grad_norm": 0.5367720127105713, "learning_rate": 0.0002, "epoch": 3.7627268702965915, "step": 4250}, {"loss": 1.5234, "grad_norm": 0.5785109400749207, "learning_rate": 0.0002, "epoch": 3.7715803452855248, "step": 4260}, {"loss": 1.4947, "grad_norm": 0.5698465704917908, "learning_rate": 0.0002, "epoch": 3.7804338202744576, "step": 4270}, {"loss": 1.4769, "grad_norm": 0.5748036503791809, "learning_rate": 0.0002, "epoch": 3.789287295263391, "step": 4280}, {"loss": 1.5503, "grad_norm": 0.608147382736206, "learning_rate": 0.0002, "epoch": 3.798140770252324, "step": 4290}, {"loss": 1.5354, "grad_norm": 0.5820456147193909, "learning_rate": 0.0002, "epoch": 3.806994245241257, "step": 4300}, {"loss": 1.5668, "grad_norm": 0.6325612664222717, "learning_rate": 0.0002, "epoch": 3.8158477202301904, "step": 4310}, {"loss": 1.5295, "grad_norm": 0.6465362310409546, "learning_rate": 0.0002, "epoch": 3.8247011952191237, "step": 4320}, {"loss": 1.5048, "grad_norm": 0.5630854368209839, "learning_rate": 0.0002, "epoch": 3.8335546702080565, "step": 4330}, {"loss": 1.5636, "grad_norm": 0.6181462407112122, "learning_rate": 0.0002, "epoch": 3.84240814519699, "step": 4340}, {"loss": 1.5113, "grad_norm": 0.6207571029663086, "learning_rate": 0.0002, "epoch": 3.851261620185923, "step": 4350}, {"loss": 1.5424, "grad_norm": 0.6092919111251831, "learning_rate": 0.0002, "epoch": 3.860115095174856, "step": 4360}, {"loss": 1.5214, "grad_norm": 0.6140493750572205, "learning_rate": 0.0002, "epoch": 3.8689685701637893, "step": 4370}, {"loss": 1.5574, "grad_norm": 0.611575722694397, "learning_rate": 0.0002, "epoch": 3.8778220451527226, "step": 4380}, {"loss": 1.5563, "grad_norm": 0.6288794279098511, "learning_rate": 0.0002, "epoch": 3.8866755201416554, "step": 4390}, {"loss": 1.4967, "grad_norm": 0.6518979072570801, "learning_rate": 0.0002, "epoch": 3.8955289951305887, "step": 4400}, {"loss": 1.5366, "grad_norm": 0.6144753098487854, "learning_rate": 0.0002, "epoch": 3.904382470119522, "step": 4410}, {"loss": 1.6285, "grad_norm": 0.7034937143325806, "learning_rate": 0.0002, "epoch": 3.913235945108455, "step": 4420}, {"loss": 1.4978, "grad_norm": 0.5713187456130981, "learning_rate": 0.0002, "epoch": 3.922089420097388, "step": 4430}, {"loss": 1.5532, "grad_norm": 0.6187576651573181, "learning_rate": 0.0002, "epoch": 3.9309428950863214, "step": 4440}, {"loss": 1.551, "grad_norm": 0.6439383029937744, "learning_rate": 0.0002, "epoch": 3.9397963700752543, "step": 4450}, {"loss": 1.5073, "grad_norm": 0.6133334636688232, "learning_rate": 0.0002, "epoch": 3.9486498450641876, "step": 4460}, {"loss": 1.538, "grad_norm": 0.593463659286499, "learning_rate": 0.0002, "epoch": 3.957503320053121, "step": 4470}, {"loss": 1.5636, "grad_norm": 0.6261998414993286, "learning_rate": 0.0002, "epoch": 3.9663567950420537, "step": 4480}, {"loss": 1.4888, "grad_norm": 0.6153767704963684, "learning_rate": 0.0002, "epoch": 3.975210270030987, "step": 4490}, {"loss": 1.4986, "grad_norm": 0.6184002757072449, "learning_rate": 0.0002, "epoch": 3.9840637450199203, "step": 4500}, {"loss": 1.5134, "grad_norm": 0.5212734341621399, "learning_rate": 0.0002, "epoch": 3.9929172200088536, "step": 4510}, {"eval_loss": 1.8745536804199219, "eval_runtime": 83.0125, "eval_samples_per_second": 6.204, "eval_steps_per_second": 0.783, "epoch": 4.0, "step": 4518}, {"loss": 1.4708, "grad_norm": 0.5871603488922119, "learning_rate": 0.0002, "epoch": 4.001770694997787, "step": 4520}, {"loss": 1.4139, "grad_norm": 0.6746091842651367, "learning_rate": 0.0002, "epoch": 4.01062416998672, "step": 4530}, {"loss": 1.3625, "grad_norm": 0.6159639358520508, "learning_rate": 0.0002, "epoch": 4.019477644975653, "step": 4540}, {"loss": 1.3766, "grad_norm": 0.7529398202896118, "learning_rate": 0.0002, "epoch": 4.028331119964586, "step": 4550}, {"loss": 1.3202, "grad_norm": 0.788398027420044, "learning_rate": 0.0002, "epoch": 4.037184594953519, "step": 4560}, {"loss": 1.4254, "grad_norm": 0.9679850935935974, "learning_rate": 0.0002, "epoch": 4.046038069942452, "step": 4570}, {"loss": 1.2911, "grad_norm": 0.6305310130119324, "learning_rate": 0.0002, "epoch": 4.054891544931386, "step": 4580}, {"loss": 1.3525, "grad_norm": 0.8557451963424683, "learning_rate": 0.0002, "epoch": 4.063745019920319, "step": 4590}, {"loss": 1.3901, "grad_norm": 0.741518497467041, "learning_rate": 0.0002, "epoch": 4.0725984949092515, "step": 4600}, {"loss": 1.3374, "grad_norm": 0.6573862433433533, "learning_rate": 0.0002, "epoch": 4.081451969898185, "step": 4610}, {"loss": 1.3341, "grad_norm": 0.6926319599151611, "learning_rate": 0.0002, "epoch": 4.090305444887118, "step": 4620}, {"loss": 1.4176, "grad_norm": 0.9212626218795776, "learning_rate": 0.0002, "epoch": 4.099158919876051, "step": 4630}, {"loss": 1.3402, "grad_norm": 0.7167867422103882, "learning_rate": 0.0002, "epoch": 4.108012394864985, "step": 4640}, {"loss": 1.3333, "grad_norm": 0.6691595911979675, "learning_rate": 0.0002, "epoch": 4.116865869853918, "step": 4650}, {"loss": 1.247, "grad_norm": 0.8708247542381287, "learning_rate": 0.0002, "epoch": 4.12571934484285, "step": 4660}, {"loss": 1.3599, "grad_norm": 0.8612170219421387, "learning_rate": 0.0002, "epoch": 4.134572819831784, "step": 4670}, {"loss": 1.3418, "grad_norm": 0.7688325047492981, "learning_rate": 0.0002, "epoch": 4.143426294820717, "step": 4680}, {"loss": 1.4349, "grad_norm": 0.7606917023658752, "learning_rate": 0.0002, "epoch": 4.152279769809651, "step": 4690}, {"loss": 1.3521, "grad_norm": 0.8241282105445862, "learning_rate": 0.0002, "epoch": 4.161133244798584, "step": 4700}, {"loss": 1.3325, "grad_norm": 0.7480464577674866, "learning_rate": 0.0002, "epoch": 4.1699867197875164, "step": 4710}, {"loss": 1.4027, "grad_norm": 0.7092460989952087, "learning_rate": 0.0002, "epoch": 4.17884019477645, "step": 4720}, {"loss": 1.4005, "grad_norm": 0.8782108426094055, "learning_rate": 0.0002, "epoch": 4.187693669765383, "step": 4730}, {"loss": 1.3626, "grad_norm": 0.6875300407409668, "learning_rate": 0.0002, "epoch": 4.196547144754316, "step": 4740}, {"loss": 1.3798, "grad_norm": 0.7713887691497803, "learning_rate": 0.0002, "epoch": 4.20540061974325, "step": 4750}, {"loss": 1.3822, "grad_norm": 0.8270819783210754, "learning_rate": 0.0002, "epoch": 4.2142540947321825, "step": 4760}, {"loss": 1.3559, "grad_norm": 0.7109288573265076, "learning_rate": 0.0002, "epoch": 4.223107569721115, "step": 4770}, {"loss": 1.3948, "grad_norm": 0.7209359407424927, "learning_rate": 0.0002, "epoch": 4.231961044710049, "step": 4780}, {"loss": 1.3691, "grad_norm": 0.7142833471298218, "learning_rate": 0.0002, "epoch": 4.240814519698982, "step": 4790}, {"loss": 1.3654, "grad_norm": 0.8526809811592102, "learning_rate": 0.0002, "epoch": 4.249667994687915, "step": 4800}, {"loss": 1.3819, "grad_norm": 0.7064695954322815, "learning_rate": 0.0002, "epoch": 4.2585214696768485, "step": 4810}, {"loss": 1.3333, "grad_norm": 0.7646124362945557, "learning_rate": 0.0002, "epoch": 4.267374944665781, "step": 4820}, {"loss": 1.4247, "grad_norm": 0.7377115488052368, "learning_rate": 0.0002, "epoch": 4.276228419654714, "step": 4830}, {"loss": 1.3683, "grad_norm": 0.7308453321456909, "learning_rate": 0.0002, "epoch": 4.285081894643648, "step": 4840}, {"loss": 1.3653, "grad_norm": 0.6687684059143066, "learning_rate": 0.0002, "epoch": 4.293935369632581, "step": 4850}, {"loss": 1.3538, "grad_norm": 0.7447634339332581, "learning_rate": 0.0002, "epoch": 4.302788844621514, "step": 4860}, {"loss": 1.3842, "grad_norm": 0.7661601305007935, "learning_rate": 0.0002, "epoch": 4.311642319610447, "step": 4870}, {"loss": 1.3783, "grad_norm": 0.7492215037345886, "learning_rate": 0.0002, "epoch": 4.32049579459938, "step": 4880}, {"loss": 1.4089, "grad_norm": 0.9554458856582642, "learning_rate": 0.0002, "epoch": 4.329349269588313, "step": 4890}, {"loss": 1.3582, "grad_norm": 0.7409822940826416, "learning_rate": 0.0002, "epoch": 4.338202744577247, "step": 4900}, {"loss": 1.2581, "grad_norm": 0.9848645329475403, "learning_rate": 0.0002, "epoch": 4.34705621956618, "step": 4910}, {"loss": 1.3809, "grad_norm": 0.803995668888092, "learning_rate": 0.0002, "epoch": 4.355909694555113, "step": 4920}, {"loss": 1.3585, "grad_norm": 0.7480606436729431, "learning_rate": 0.0002, "epoch": 4.364763169544046, "step": 4930}, {"loss": 1.4092, "grad_norm": 0.7018141150474548, "learning_rate": 0.0002, "epoch": 4.373616644532979, "step": 4940}, {"loss": 1.4034, "grad_norm": 0.7684932351112366, "learning_rate": 0.0002, "epoch": 4.382470119521912, "step": 4950}, {"loss": 1.3937, "grad_norm": 0.7849185466766357, "learning_rate": 0.0002, "epoch": 4.391323594510846, "step": 4960}, {"loss": 1.3763, "grad_norm": 0.7858862280845642, "learning_rate": 0.0002, "epoch": 4.400177069499779, "step": 4970}, {"loss": 1.3901, "grad_norm": 0.8270778059959412, "learning_rate": 0.0002, "epoch": 4.4090305444887115, "step": 4980}, {"loss": 1.445, "grad_norm": 0.8464101552963257, "learning_rate": 0.0002, "epoch": 4.417884019477645, "step": 4990}, {"loss": 1.3586, "grad_norm": 0.85670405626297, "learning_rate": 0.0002, "epoch": 4.426737494466578, "step": 5000}, {"loss": 1.4203, "grad_norm": 0.8656655550003052, "learning_rate": 0.0002, "epoch": 4.435590969455511, "step": 5010}, {"loss": 1.3426, "grad_norm": 0.7605292201042175, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 5020}, {"loss": 1.3803, "grad_norm": 0.7682471871376038, "learning_rate": 0.0002, "epoch": 4.4532979194333775, "step": 5030}, {"loss": 1.3432, "grad_norm": 0.7209102511405945, "learning_rate": 0.0002, "epoch": 4.46215139442231, "step": 5040}, {"loss": 1.5126, "grad_norm": 0.8259989023208618, "learning_rate": 0.0002, "epoch": 4.471004869411244, "step": 5050}, {"loss": 1.3709, "grad_norm": 0.7342197895050049, "learning_rate": 0.0002, "epoch": 4.479858344400177, "step": 5060}, {"loss": 1.4196, "grad_norm": 0.7869040369987488, "learning_rate": 0.0002, "epoch": 4.48871181938911, "step": 5070}, {"loss": 1.3734, "grad_norm": 0.7906143665313721, "learning_rate": 0.0002, "epoch": 4.4975652943780435, "step": 5080}, {"loss": 1.3555, "grad_norm": 0.7336861491203308, "learning_rate": 0.0002, "epoch": 4.506418769366976, "step": 5090}, {"loss": 1.3768, "grad_norm": 0.8264166712760925, "learning_rate": 0.0002, "epoch": 4.515272244355909, "step": 5100}, {"loss": 1.3822, "grad_norm": 0.8144693970680237, "learning_rate": 0.0002, "epoch": 4.524125719344843, "step": 5110}, {"loss": 1.3044, "grad_norm": 0.8257269263267517, "learning_rate": 0.0002, "epoch": 4.532979194333776, "step": 5120}, {"loss": 1.3501, "grad_norm": 0.8838174343109131, "learning_rate": 0.0002, "epoch": 4.541832669322709, "step": 5130}, {"loss": 1.3464, "grad_norm": 0.7081145644187927, "learning_rate": 0.0002, "epoch": 4.550686144311642, "step": 5140}, {"loss": 1.342, "grad_norm": 0.7137823700904846, "learning_rate": 0.0002, "epoch": 4.559539619300575, "step": 5150}, {"loss": 1.3788, "grad_norm": 0.7890386581420898, "learning_rate": 0.0002, "epoch": 4.568393094289509, "step": 5160}, {"loss": 1.3368, "grad_norm": 0.6418015360832214, "learning_rate": 0.0002, "epoch": 4.577246569278442, "step": 5170}, {"loss": 1.3892, "grad_norm": 0.768373966217041, "learning_rate": 0.0002, "epoch": 4.586100044267375, "step": 5180}, {"loss": 1.3953, "grad_norm": 0.6934067606925964, "learning_rate": 0.0002, "epoch": 4.5949535192563085, "step": 5190}, {"loss": 1.3782, "grad_norm": 0.9430719017982483, "learning_rate": 0.0002, "epoch": 4.603806994245241, "step": 5200}, {"loss": 1.3981, "grad_norm": 0.880264163017273, "learning_rate": 0.0002, "epoch": 4.612660469234174, "step": 5210}, {"loss": 1.3506, "grad_norm": 0.7584623098373413, "learning_rate": 0.0002, "epoch": 4.621513944223108, "step": 5220}, {"loss": 1.3973, "grad_norm": 0.7974506616592407, "learning_rate": 0.0002, "epoch": 4.630367419212041, "step": 5230}, {"loss": 1.3818, "grad_norm": 0.8812133073806763, "learning_rate": 0.0002, "epoch": 4.639220894200974, "step": 5240}, {"loss": 1.4002, "grad_norm": 0.8968724012374878, "learning_rate": 0.0002, "epoch": 4.648074369189907, "step": 5250}, {"loss": 1.3327, "grad_norm": 0.7317764759063721, "learning_rate": 0.0002, "epoch": 4.65692784417884, "step": 5260}, {"loss": 1.4363, "grad_norm": 0.7415484189987183, "learning_rate": 0.0002, "epoch": 4.665781319167773, "step": 5270}, {"loss": 1.3673, "grad_norm": 0.7867009043693542, "learning_rate": 0.0002, "epoch": 4.674634794156707, "step": 5280}, {"loss": 1.4246, "grad_norm": 0.6895416378974915, "learning_rate": 0.0002, "epoch": 4.68348826914564, "step": 5290}, {"loss": 1.3438, "grad_norm": 0.7324506640434265, "learning_rate": 0.0002, "epoch": 4.6923417441345725, "step": 5300}, {"loss": 1.4072, "grad_norm": 0.7383193969726562, "learning_rate": 0.0002, "epoch": 4.701195219123506, "step": 5310}, {"loss": 1.3269, "grad_norm": 0.8254916071891785, "learning_rate": 0.0002, "epoch": 4.710048694112439, "step": 5320}, {"loss": 1.4317, "grad_norm": 0.8161033987998962, "learning_rate": 0.0002, "epoch": 4.718902169101372, "step": 5330}, {"loss": 1.3623, "grad_norm": 0.7664386034011841, "learning_rate": 0.0002, "epoch": 4.727755644090306, "step": 5340}, {"loss": 1.4293, "grad_norm": 0.7465475797653198, "learning_rate": 0.0002, "epoch": 4.7366091190792385, "step": 5350}, {"loss": 1.3435, "grad_norm": 0.7810078263282776, "learning_rate": 0.0002, "epoch": 4.745462594068171, "step": 5360}, {"loss": 1.4489, "grad_norm": 0.7428439855575562, "learning_rate": 0.0002, "epoch": 4.754316069057105, "step": 5370}, {"loss": 1.3607, "grad_norm": 0.9548320174217224, "learning_rate": 0.0002, "epoch": 4.763169544046038, "step": 5380}, {"loss": 1.3398, "grad_norm": 0.7959533333778381, "learning_rate": 0.0002, "epoch": 4.772023019034972, "step": 5390}, {"loss": 1.3448, "grad_norm": 0.747473418712616, "learning_rate": 0.0002, "epoch": 4.780876494023905, "step": 5400}, {"loss": 1.3954, "grad_norm": 0.7863122820854187, "learning_rate": 0.0002, "epoch": 4.789729969012837, "step": 5410}, {"loss": 1.4166, "grad_norm": 0.7769626379013062, "learning_rate": 0.0002, "epoch": 4.798583444001771, "step": 5420}, {"loss": 1.4484, "grad_norm": 0.8551191091537476, "learning_rate": 0.0002, "epoch": 4.807436918990704, "step": 5430}, {"loss": 1.4314, "grad_norm": 0.8364850878715515, "learning_rate": 0.0002, "epoch": 4.816290393979637, "step": 5440}, {"loss": 1.4028, "grad_norm": 0.7458856701850891, "learning_rate": 0.0002, "epoch": 4.825143868968571, "step": 5450}, {"loss": 1.3923, "grad_norm": 0.7558291554450989, "learning_rate": 0.0002, "epoch": 4.8339973439575035, "step": 5460}, {"loss": 1.3343, "grad_norm": 0.8396534323692322, "learning_rate": 0.0002, "epoch": 4.842850818946436, "step": 5470}, {"loss": 1.3853, "grad_norm": 0.7790794968605042, "learning_rate": 0.0002, "epoch": 4.85170429393537, "step": 5480}, {"loss": 1.406, "grad_norm": 0.8607641458511353, "learning_rate": 0.0002, "epoch": 4.860557768924303, "step": 5490}, {"loss": 1.4011, "grad_norm": 0.828134298324585, "learning_rate": 0.0002, "epoch": 4.869411243913236, "step": 5500}, {"loss": 1.4089, "grad_norm": 0.8783106803894043, "learning_rate": 0.0002, "epoch": 4.8782647189021695, "step": 5510}, {"loss": 1.4565, "grad_norm": 0.7476183176040649, "learning_rate": 0.0002, "epoch": 4.887118193891102, "step": 5520}, {"loss": 1.3974, "grad_norm": 0.8023254871368408, "learning_rate": 0.0002, "epoch": 4.895971668880035, "step": 5530}, {"loss": 1.2979, "grad_norm": 0.8021706938743591, "learning_rate": 0.0002, "epoch": 4.904825143868969, "step": 5540}, {"loss": 1.4139, "grad_norm": 0.7873618602752686, "learning_rate": 0.0002, "epoch": 4.913678618857902, "step": 5550}, {"loss": 1.4393, "grad_norm": 0.7181428670883179, "learning_rate": 0.0002, "epoch": 4.922532093846835, "step": 5560}, {"loss": 1.3968, "grad_norm": 0.7464273571968079, "learning_rate": 0.0002, "epoch": 4.931385568835768, "step": 5570}, {"loss": 1.3184, "grad_norm": 0.7433671355247498, "learning_rate": 0.0002, "epoch": 4.940239043824701, "step": 5580}, {"loss": 1.4174, "grad_norm": 0.7571114301681519, "learning_rate": 0.0002, "epoch": 4.949092518813634, "step": 5590}, {"loss": 1.4418, "grad_norm": 0.7811630964279175, "learning_rate": 0.0002, "epoch": 4.957945993802568, "step": 5600}, {"loss": 1.4288, "grad_norm": 0.7609148621559143, "learning_rate": 0.0002, "epoch": 4.966799468791501, "step": 5610}, {"loss": 1.3786, "grad_norm": 0.7324382066726685, "learning_rate": 0.0002, "epoch": 4.9756529437804335, "step": 5620}, {"loss": 1.4557, "grad_norm": 0.9249559640884399, "learning_rate": 0.0002, "epoch": 4.984506418769367, "step": 5630}, {"loss": 1.4064, "grad_norm": 0.7852522134780884, "learning_rate": 0.0002, "epoch": 4.9933598937583, "step": 5640}, {"eval_loss": 1.9384633302688599, "eval_runtime": 82.6042, "eval_samples_per_second": 6.235, "eval_steps_per_second": 0.787, "epoch": 4.999557326250553, "step": 5647}, {"loss": 1.4261, "grad_norm": 0.8052749037742615, "learning_rate": 0.0002, "epoch": 5.002213368747233, "step": 5650}, {"loss": 1.1967, "grad_norm": 1.380603551864624, "learning_rate": 0.0002, "epoch": 5.011066843736167, "step": 5660}, {"loss": 1.1871, "grad_norm": 0.9197829365730286, "learning_rate": 0.0002, "epoch": 5.0199203187251, "step": 5670}, {"loss": 1.1966, "grad_norm": 0.9338570833206177, "learning_rate": 0.0002, "epoch": 5.028773793714032, "step": 5680}, {"loss": 1.1866, "grad_norm": 1.0464060306549072, "learning_rate": 0.0002, "epoch": 5.037627268702966, "step": 5690}, {"loss": 1.2211, "grad_norm": 0.9055638909339905, "learning_rate": 0.0002, "epoch": 5.046480743691899, "step": 5700}, {"loss": 1.1987, "grad_norm": 0.9494627714157104, "learning_rate": 0.0002, "epoch": 5.055334218680832, "step": 5710}, {"loss": 1.2647, "grad_norm": 0.9680962562561035, "learning_rate": 0.0002, "epoch": 5.064187693669766, "step": 5720}, {"loss": 1.2452, "grad_norm": 1.0254695415496826, "learning_rate": 0.0002, "epoch": 5.0730411686586985, "step": 5730}, {"loss": 1.2006, "grad_norm": 0.9306758642196655, "learning_rate": 0.0002, "epoch": 5.081894643647631, "step": 5740}, {"loss": 1.2254, "grad_norm": 1.0620356798171997, "learning_rate": 0.0002, "epoch": 5.090748118636565, "step": 5750}, {"loss": 1.2628, "grad_norm": 1.0401700735092163, "learning_rate": 0.0002, "epoch": 5.099601593625498, "step": 5760}, {"loss": 1.1976, "grad_norm": 0.9916906952857971, "learning_rate": 0.0002, "epoch": 5.108455068614431, "step": 5770}, {"loss": 1.2847, "grad_norm": 0.8387252688407898, "learning_rate": 0.0002, "epoch": 5.1173085436033645, "step": 5780}, {"loss": 1.2472, "grad_norm": 0.9870850443840027, "learning_rate": 0.0002, "epoch": 5.126162018592297, "step": 5790}, {"loss": 1.1902, "grad_norm": 0.9204064011573792, "learning_rate": 0.0002, "epoch": 5.13501549358123, "step": 5800}, {"loss": 1.2266, "grad_norm": 0.9951931834220886, "learning_rate": 0.0002, "epoch": 5.143868968570164, "step": 5810}, {"loss": 1.2113, "grad_norm": 0.9745809435844421, "learning_rate": 0.0002, "epoch": 5.152722443559097, "step": 5820}, {"loss": 1.2549, "grad_norm": 0.9467785954475403, "learning_rate": 0.0002, "epoch": 5.16157591854803, "step": 5830}, {"loss": 1.2309, "grad_norm": 1.0451668500900269, "learning_rate": 0.0002, "epoch": 5.170429393536963, "step": 5840}, {"loss": 1.2215, "grad_norm": 0.9740142822265625, "learning_rate": 0.0002, "epoch": 5.179282868525896, "step": 5850}, {"loss": 1.2137, "grad_norm": 1.2158266305923462, "learning_rate": 0.0002, "epoch": 5.18813634351483, "step": 5860}, {"loss": 1.1631, "grad_norm": 1.0795036554336548, "learning_rate": 0.0002, "epoch": 5.196989818503763, "step": 5870}, {"loss": 1.1448, "grad_norm": 0.9578470587730408, "learning_rate": 0.0002, "epoch": 5.205843293492696, "step": 5880}, {"loss": 1.2183, "grad_norm": 0.8887509703636169, "learning_rate": 0.0002, "epoch": 5.214696768481629, "step": 5890}, {"loss": 1.1991, "grad_norm": 1.171006441116333, "learning_rate": 0.0002, "epoch": 5.223550243470562, "step": 5900}, {"loss": 1.1781, "grad_norm": 0.9016029834747314, "learning_rate": 0.0002, "epoch": 5.232403718459495, "step": 5910}, {"loss": 1.2057, "grad_norm": 1.173136830329895, "learning_rate": 0.0002, "epoch": 5.241257193448429, "step": 5920}, {"loss": 1.2856, "grad_norm": 0.8760318160057068, "learning_rate": 0.0002, "epoch": 5.250110668437362, "step": 5930}, {"loss": 1.2301, "grad_norm": 0.8998854160308838, "learning_rate": 0.0002, "epoch": 5.258964143426295, "step": 5940}, {"loss": 1.3058, "grad_norm": 1.017175316810608, "learning_rate": 0.0002, "epoch": 5.267817618415228, "step": 5950}, {"loss": 1.2552, "grad_norm": 0.8646609783172607, "learning_rate": 0.0002, "epoch": 5.276671093404161, "step": 5960}, {"loss": 1.2044, "grad_norm": 1.0030627250671387, "learning_rate": 0.0002, "epoch": 5.285524568393094, "step": 5970}, {"loss": 1.2365, "grad_norm": 0.975911557674408, "learning_rate": 0.0002, "epoch": 5.294378043382028, "step": 5980}, {"loss": 1.2307, "grad_norm": 0.9576130509376526, "learning_rate": 0.0002, "epoch": 5.303231518370961, "step": 5990}, {"loss": 1.2681, "grad_norm": 0.9566167593002319, "learning_rate": 0.0002, "epoch": 5.3120849933598935, "step": 6000}, {"loss": 1.2029, "grad_norm": 0.9200350642204285, "learning_rate": 0.0002, "epoch": 5.320938468348827, "step": 6010}, {"loss": 1.1871, "grad_norm": 1.0491118431091309, "learning_rate": 0.0002, "epoch": 5.32979194333776, "step": 6020}, {"loss": 1.2531, "grad_norm": 1.1199153661727905, "learning_rate": 0.0002, "epoch": 5.338645418326693, "step": 6030}, {"loss": 1.265, "grad_norm": 1.015252947807312, "learning_rate": 0.0002, "epoch": 5.347498893315627, "step": 6040}, {"loss": 1.2208, "grad_norm": 1.1076666116714478, "learning_rate": 0.0002, "epoch": 5.3563523683045595, "step": 6050}, {"loss": 1.1953, "grad_norm": 0.9224653840065002, "learning_rate": 0.0002, "epoch": 5.365205843293492, "step": 6060}, {"loss": 1.2045, "grad_norm": 1.0079779624938965, "learning_rate": 0.0002, "epoch": 5.374059318282426, "step": 6070}, {"loss": 1.2612, "grad_norm": 0.9627894759178162, "learning_rate": 0.0002, "epoch": 5.382912793271359, "step": 6080}, {"loss": 1.3116, "grad_norm": 1.0503166913986206, "learning_rate": 0.0002, "epoch": 5.391766268260292, "step": 6090}, {"loss": 1.2565, "grad_norm": 0.912736713886261, "learning_rate": 0.0002, "epoch": 5.400619743249226, "step": 6100}, {"loss": 1.204, "grad_norm": 1.2552032470703125, "learning_rate": 0.0002, "epoch": 5.409473218238158, "step": 6110}, {"loss": 1.2738, "grad_norm": 0.986230731010437, "learning_rate": 0.0002, "epoch": 5.418326693227091, "step": 6120}, {"loss": 1.3301, "grad_norm": 0.9869757294654846, "learning_rate": 0.0002, "epoch": 5.427180168216025, "step": 6130}, {"loss": 1.241, "grad_norm": 1.012027621269226, "learning_rate": 0.0002, "epoch": 5.436033643204958, "step": 6140}, {"loss": 1.224, "grad_norm": 0.8855568170547485, "learning_rate": 0.0002, "epoch": 5.444887118193891, "step": 6150}, {"loss": 1.2539, "grad_norm": 1.1522414684295654, "learning_rate": 0.0002, "epoch": 5.4537405931828244, "step": 6160}, {"loss": 1.2402, "grad_norm": 1.2448474168777466, "learning_rate": 0.0002, "epoch": 5.462594068171757, "step": 6170}, {"loss": 1.179, "grad_norm": 1.0362223386764526, "learning_rate": 0.0002, "epoch": 5.471447543160691, "step": 6180}, {"loss": 1.2351, "grad_norm": 0.9363031983375549, "learning_rate": 0.0002, "epoch": 5.480301018149624, "step": 6190}, {"loss": 1.2394, "grad_norm": 0.8852020502090454, "learning_rate": 0.0002, "epoch": 5.489154493138557, "step": 6200}, {"loss": 1.311, "grad_norm": 0.8577062487602234, "learning_rate": 0.0002, "epoch": 5.4980079681274905, "step": 6210}, {"loss": 1.2547, "grad_norm": 0.9351891875267029, "learning_rate": 0.0002, "epoch": 5.506861443116423, "step": 6220}, {"loss": 1.2804, "grad_norm": 1.0031992197036743, "learning_rate": 0.0002, "epoch": 5.515714918105356, "step": 6230}, {"loss": 1.219, "grad_norm": 0.9935104250907898, "learning_rate": 0.0002, "epoch": 5.52456839309429, "step": 6240}, {"loss": 1.2756, "grad_norm": 1.1086243391036987, "learning_rate": 0.0002, "epoch": 5.533421868083223, "step": 6250}, {"loss": 1.2751, "grad_norm": 0.990772545337677, "learning_rate": 0.0002, "epoch": 5.542275343072156, "step": 6260}, {"loss": 1.2756, "grad_norm": 0.9317597150802612, "learning_rate": 0.0002, "epoch": 5.551128818061089, "step": 6270}, {"loss": 1.2095, "grad_norm": 0.9657552242279053, "learning_rate": 0.0002, "epoch": 5.559982293050022, "step": 6280}, {"loss": 1.2435, "grad_norm": 1.0271565914154053, "learning_rate": 0.0002, "epoch": 5.568835768038955, "step": 6290}, {"loss": 1.2283, "grad_norm": 0.916253924369812, "learning_rate": 0.0002, "epoch": 5.577689243027889, "step": 6300}, {"loss": 1.2648, "grad_norm": 1.0083940029144287, "learning_rate": 0.0002, "epoch": 5.586542718016822, "step": 6310}, {"loss": 1.2904, "grad_norm": 0.9740358591079712, "learning_rate": 0.0002, "epoch": 5.5953961930057545, "step": 6320}, {"loss": 1.2507, "grad_norm": 0.9645405411720276, "learning_rate": 0.0002, "epoch": 5.604249667994688, "step": 6330}, {"loss": 1.2845, "grad_norm": 0.9677100777626038, "learning_rate": 0.0002, "epoch": 5.613103142983621, "step": 6340}, {"loss": 1.2936, "grad_norm": 0.9706602692604065, "learning_rate": 0.0002, "epoch": 5.621956617972554, "step": 6350}, {"loss": 1.2541, "grad_norm": 1.1492316722869873, "learning_rate": 0.0002, "epoch": 5.630810092961488, "step": 6360}, {"loss": 1.2242, "grad_norm": 0.8857277035713196, "learning_rate": 0.0002, "epoch": 5.639663567950421, "step": 6370}, {"loss": 1.2178, "grad_norm": 1.0363037586212158, "learning_rate": 0.0002, "epoch": 5.648517042939353, "step": 6380}, {"loss": 1.1838, "grad_norm": 0.9621800780296326, "learning_rate": 0.0002, "epoch": 5.657370517928287, "step": 6390}, {"loss": 1.2472, "grad_norm": 0.9937820434570312, "learning_rate": 0.0002, "epoch": 5.66622399291722, "step": 6400}, {"loss": 1.2523, "grad_norm": 0.9491283297538757, "learning_rate": 0.0002, "epoch": 5.675077467906153, "step": 6410}, {"loss": 1.2539, "grad_norm": 0.9429448246955872, "learning_rate": 0.0002, "epoch": 5.683930942895087, "step": 6420}, {"loss": 1.1663, "grad_norm": 0.9808844327926636, "learning_rate": 0.0002, "epoch": 5.6927844178840195, "step": 6430}, {"loss": 1.2574, "grad_norm": 0.8191056847572327, "learning_rate": 0.0002, "epoch": 5.701637892872952, "step": 6440}, {"loss": 1.2659, "grad_norm": 1.1118974685668945, "learning_rate": 0.0002, "epoch": 5.710491367861886, "step": 6450}, {"loss": 1.2192, "grad_norm": 0.9030969142913818, "learning_rate": 0.0002, "epoch": 5.719344842850819, "step": 6460}, {"loss": 1.301, "grad_norm": 1.0509997606277466, "learning_rate": 0.0002, "epoch": 5.728198317839752, "step": 6470}, {"loss": 1.217, "grad_norm": 1.0369981527328491, "learning_rate": 0.0002, "epoch": 5.7370517928286855, "step": 6480}, {"loss": 1.2518, "grad_norm": 0.8626071214675903, "learning_rate": 0.0002, "epoch": 5.745905267817618, "step": 6490}, {"loss": 1.2446, "grad_norm": 1.0448849201202393, "learning_rate": 0.0002, "epoch": 5.754758742806551, "step": 6500}, {"loss": 1.2698, "grad_norm": 0.9333119988441467, "learning_rate": 0.0002, "epoch": 5.763612217795485, "step": 6510}, {"loss": 1.2655, "grad_norm": 0.8533532023429871, "learning_rate": 0.0002, "epoch": 5.772465692784418, "step": 6520}, {"loss": 1.3037, "grad_norm": 0.9774261713027954, "learning_rate": 0.0002, "epoch": 5.781319167773351, "step": 6530}, {"loss": 1.2031, "grad_norm": 0.9841071963310242, "learning_rate": 0.0002, "epoch": 5.790172642762284, "step": 6540}, {"loss": 1.2767, "grad_norm": 0.9891805052757263, "learning_rate": 0.0002, "epoch": 5.799026117751217, "step": 6550}, {"loss": 1.3373, "grad_norm": 0.9633952379226685, "learning_rate": 0.0002, "epoch": 5.80787959274015, "step": 6560}, {"loss": 1.1939, "grad_norm": 1.327634334564209, "learning_rate": 0.0002, "epoch": 5.816733067729084, "step": 6570}, {"loss": 1.2985, "grad_norm": 0.9805197715759277, "learning_rate": 0.0002, "epoch": 5.825586542718017, "step": 6580}, {"loss": 1.1933, "grad_norm": 1.020957589149475, "learning_rate": 0.0002, "epoch": 5.8344400177069495, "step": 6590}, {"loss": 1.2582, "grad_norm": 0.9694032669067383, "learning_rate": 0.0002, "epoch": 5.843293492695883, "step": 6600}, {"loss": 1.2671, "grad_norm": 0.8980914354324341, "learning_rate": 0.0002, "epoch": 5.852146967684816, "step": 6610}, {"loss": 1.3391, "grad_norm": 0.8312330842018127, "learning_rate": 0.0002, "epoch": 5.861000442673749, "step": 6620}, {"loss": 1.3301, "grad_norm": 0.9773725271224976, "learning_rate": 0.0002, "epoch": 5.869853917662683, "step": 6630}, {"loss": 1.2697, "grad_norm": 0.9684233665466309, "learning_rate": 0.0002, "epoch": 5.878707392651616, "step": 6640}, {"loss": 1.2866, "grad_norm": 0.8436519503593445, "learning_rate": 0.0002, "epoch": 5.887560867640548, "step": 6650}, {"loss": 1.2213, "grad_norm": 0.9129888415336609, "learning_rate": 0.0002, "epoch": 5.896414342629482, "step": 6660}, {"loss": 1.3272, "grad_norm": 0.8871369957923889, "learning_rate": 0.0002, "epoch": 5.905267817618415, "step": 6670}, {"loss": 1.3758, "grad_norm": 0.9544420838356018, "learning_rate": 0.0002, "epoch": 5.914121292607349, "step": 6680}, {"loss": 1.2954, "grad_norm": 0.9607448577880859, "learning_rate": 0.0002, "epoch": 5.922974767596282, "step": 6690}, {"loss": 1.2448, "grad_norm": 0.9675708413124084, "learning_rate": 0.0002, "epoch": 5.9318282425852145, "step": 6700}, {"loss": 1.3208, "grad_norm": 0.9373534321784973, "learning_rate": 0.0002, "epoch": 5.940681717574148, "step": 6710}, {"loss": 1.2982, "grad_norm": 0.9750351905822754, "learning_rate": 0.0002, "epoch": 5.949535192563081, "step": 6720}, {"loss": 1.2575, "grad_norm": 0.9122727513313293, "learning_rate": 0.0002, "epoch": 5.958388667552014, "step": 6730}, {"loss": 1.2259, "grad_norm": 0.9300726652145386, "learning_rate": 0.0002, "epoch": 5.967242142540948, "step": 6740}, {"loss": 1.2634, "grad_norm": 0.972944438457489, "learning_rate": 0.0002, "epoch": 5.9760956175298805, "step": 6750}, {"loss": 1.3252, "grad_norm": 1.2385832071304321, "learning_rate": 0.0002, "epoch": 5.984949092518813, "step": 6760}, {"loss": 1.2417, "grad_norm": 0.9080338478088379, "learning_rate": 0.0002, "epoch": 5.993802567507747, "step": 6770}]} +{"epoch": 6.999557326250553, "step": 7906, "epoch_duration": 2967.0752868652344, "total_accumulated_duration": 20656.5198366642, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}, {"eval_loss": 1.8077166080474854, "eval_runtime": 82.8351, "eval_samples_per_second": 6.217, "eval_steps_per_second": 0.785, "epoch": 2.0, "step": 2259}, {"loss": 1.7081, "grad_norm": 0.34340235590934753, "learning_rate": 0.0002, "epoch": 2.0008853474988935, "step": 2260}, {"loss": 1.6815, "grad_norm": 0.3710762858390808, "learning_rate": 0.0002, "epoch": 2.0097388224878263, "step": 2270}, {"loss": 1.5828, "grad_norm": 0.35640114545822144, "learning_rate": 0.0002, "epoch": 2.0185922974767596, "step": 2280}, {"loss": 1.6322, "grad_norm": 0.45970189571380615, "learning_rate": 0.0002, "epoch": 2.027445772465693, "step": 2290}, {"loss": 1.5598, "grad_norm": 0.4256797134876251, "learning_rate": 0.0002, "epoch": 2.0362992474546258, "step": 2300}, {"loss": 1.6271, "grad_norm": 0.42421531677246094, "learning_rate": 0.0002, "epoch": 2.045152722443559, "step": 2310}, {"loss": 1.6117, "grad_norm": 0.4032478928565979, "learning_rate": 0.0002, "epoch": 2.0540061974324924, "step": 2320}, {"loss": 1.6389, "grad_norm": 0.4073623716831207, "learning_rate": 0.0002, "epoch": 2.062859672421425, "step": 2330}, {"loss": 1.6527, "grad_norm": 0.4845200777053833, "learning_rate": 0.0002, "epoch": 2.0717131474103585, "step": 2340}, {"loss": 1.5734, "grad_norm": 0.40578293800354004, "learning_rate": 0.0002, "epoch": 2.080566622399292, "step": 2350}, {"loss": 1.5853, "grad_norm": 0.4037284255027771, "learning_rate": 0.0002, "epoch": 2.089420097388225, "step": 2360}, {"loss": 1.6511, "grad_norm": 0.4717613160610199, "learning_rate": 0.0002, "epoch": 2.098273572377158, "step": 2370}, {"loss": 1.6273, "grad_norm": 0.42076411843299866, "learning_rate": 0.0002, "epoch": 2.1071270473660912, "step": 2380}, {"loss": 1.654, "grad_norm": 0.47799113392829895, "learning_rate": 0.0002, "epoch": 2.1159805223550245, "step": 2390}, {"loss": 1.5528, "grad_norm": 0.4253084063529968, "learning_rate": 0.0002, "epoch": 2.1248339973439574, "step": 2400}, {"loss": 1.6432, "grad_norm": 0.5023085474967957, "learning_rate": 0.0002, "epoch": 2.1336874723328907, "step": 2410}, {"loss": 1.5926, "grad_norm": 0.49162712693214417, "learning_rate": 0.0002, "epoch": 2.142540947321824, "step": 2420}, {"loss": 1.5779, "grad_norm": 0.39035019278526306, "learning_rate": 0.0002, "epoch": 2.151394422310757, "step": 2430}, {"loss": 1.7526, "grad_norm": 0.43223854899406433, "learning_rate": 0.0002, "epoch": 2.16024789729969, "step": 2440}, {"loss": 1.6334, "grad_norm": 0.4596616327762604, "learning_rate": 0.0002, "epoch": 2.1691013722886234, "step": 2450}, {"loss": 1.6067, "grad_norm": 0.4469447731971741, "learning_rate": 0.0002, "epoch": 2.1779548472775563, "step": 2460}, {"loss": 1.5806, "grad_norm": 0.5100595355033875, "learning_rate": 0.0002, "epoch": 2.1868083222664896, "step": 2470}, {"loss": 1.6456, "grad_norm": 0.4169430732727051, "learning_rate": 0.0002, "epoch": 2.195661797255423, "step": 2480}, {"loss": 1.6734, "grad_norm": 0.4699254035949707, "learning_rate": 0.0002, "epoch": 2.2045152722443557, "step": 2490}, {"loss": 1.6259, "grad_norm": 0.43524250388145447, "learning_rate": 0.0002, "epoch": 2.213368747233289, "step": 2500}, {"loss": 1.6717, "grad_norm": 0.4496648907661438, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2510}, {"loss": 1.6735, "grad_norm": 0.43408212065696716, "learning_rate": 0.0002, "epoch": 2.231075697211155, "step": 2520}, {"loss": 1.611, "grad_norm": 0.4596034288406372, "learning_rate": 0.0002, "epoch": 2.2399291722000885, "step": 2530}, {"loss": 1.6271, "grad_norm": 0.5217021107673645, "learning_rate": 0.0002, "epoch": 2.2487826471890218, "step": 2540}, {"loss": 1.6027, "grad_norm": 0.44745638966560364, "learning_rate": 0.0002, "epoch": 2.2576361221779546, "step": 2550}, {"loss": 1.675, "grad_norm": 0.4484798014163971, "learning_rate": 0.0002, "epoch": 2.266489597166888, "step": 2560}, {"loss": 1.5321, "grad_norm": 0.4428067207336426, "learning_rate": 0.0002, "epoch": 2.275343072155821, "step": 2570}, {"loss": 1.6716, "grad_norm": 0.5095171332359314, "learning_rate": 0.0002, "epoch": 2.2841965471447545, "step": 2580}, {"loss": 1.5661, "grad_norm": 0.44833096861839294, "learning_rate": 0.0002, "epoch": 2.2930500221336874, "step": 2590}, {"loss": 1.652, "grad_norm": 0.507905900478363, "learning_rate": 0.0002, "epoch": 2.3019034971226207, "step": 2600}, {"loss": 1.5963, "grad_norm": 0.40808171033859253, "learning_rate": 0.0002, "epoch": 2.310756972111554, "step": 2610}, {"loss": 1.6574, "grad_norm": 0.4684814214706421, "learning_rate": 0.0002, "epoch": 2.319610447100487, "step": 2620}, {"loss": 1.587, "grad_norm": 0.44864922761917114, "learning_rate": 0.0002, "epoch": 2.32846392208942, "step": 2630}, {"loss": 1.5828, "grad_norm": 0.4174162745475769, "learning_rate": 0.0002, "epoch": 2.3373173970783534, "step": 2640}, {"loss": 1.642, "grad_norm": 0.42314743995666504, "learning_rate": 0.0002, "epoch": 2.3461708720672863, "step": 2650}, {"loss": 1.5884, "grad_norm": 0.49224185943603516, "learning_rate": 0.0002, "epoch": 2.3550243470562195, "step": 2660}, {"loss": 1.5766, "grad_norm": 0.45190292596817017, "learning_rate": 0.0002, "epoch": 2.363877822045153, "step": 2670}, {"loss": 1.6284, "grad_norm": 0.41817107796669006, "learning_rate": 0.0002, "epoch": 2.3727312970340857, "step": 2680}, {"loss": 1.6356, "grad_norm": 0.6436763405799866, "learning_rate": 0.0002, "epoch": 2.381584772023019, "step": 2690}, {"loss": 1.5915, "grad_norm": 0.47175949811935425, "learning_rate": 0.0002, "epoch": 2.3904382470119523, "step": 2700}, {"loss": 1.6303, "grad_norm": 0.480339378118515, "learning_rate": 0.0002, "epoch": 2.3992917220008856, "step": 2710}, {"loss": 1.5697, "grad_norm": 0.4723486006259918, "learning_rate": 0.0002, "epoch": 2.4081451969898184, "step": 2720}, {"loss": 1.54, "grad_norm": 0.4305492043495178, "learning_rate": 0.0002, "epoch": 2.4169986719787517, "step": 2730}, {"loss": 1.71, "grad_norm": 0.5007492303848267, "learning_rate": 0.0002, "epoch": 2.425852146967685, "step": 2740}, {"loss": 1.5369, "grad_norm": 0.5374062061309814, "learning_rate": 0.0002, "epoch": 2.434705621956618, "step": 2750}, {"loss": 1.6156, "grad_norm": 0.45866212248802185, "learning_rate": 0.0002, "epoch": 2.443559096945551, "step": 2760}, {"loss": 1.6066, "grad_norm": 0.47914502024650574, "learning_rate": 0.0002, "epoch": 2.4524125719344845, "step": 2770}, {"loss": 1.5644, "grad_norm": 0.43804746866226196, "learning_rate": 0.0002, "epoch": 2.4612660469234173, "step": 2780}, {"loss": 1.5952, "grad_norm": 0.43656906485557556, "learning_rate": 0.0002, "epoch": 2.4701195219123506, "step": 2790}, {"loss": 1.6311, "grad_norm": 0.4820363521575928, "learning_rate": 0.0002, "epoch": 2.478972996901284, "step": 2800}, {"loss": 1.5375, "grad_norm": 0.4916800558567047, "learning_rate": 0.0002, "epoch": 2.4878264718902168, "step": 2810}, {"loss": 1.5736, "grad_norm": 0.4521256983280182, "learning_rate": 0.0002, "epoch": 2.49667994687915, "step": 2820}, {"loss": 1.6179, "grad_norm": 0.5066806674003601, "learning_rate": 0.0002, "epoch": 2.5055334218680834, "step": 2830}, {"loss": 1.5812, "grad_norm": 0.4768151640892029, "learning_rate": 0.0002, "epoch": 2.514386896857016, "step": 2840}, {"loss": 1.6719, "grad_norm": 0.5144683718681335, "learning_rate": 0.0002, "epoch": 2.5232403718459495, "step": 2850}, {"loss": 1.6063, "grad_norm": 0.4718942940235138, "learning_rate": 0.0002, "epoch": 2.532093846834883, "step": 2860}, {"loss": 1.6099, "grad_norm": 0.4924587309360504, "learning_rate": 0.0002, "epoch": 2.5409473218238157, "step": 2870}, {"loss": 1.5994, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "epoch": 2.549800796812749, "step": 2880}, {"loss": 1.6501, "grad_norm": 0.4836665987968445, "learning_rate": 0.0002, "epoch": 2.5586542718016823, "step": 2890}, {"loss": 1.6518, "grad_norm": 0.4162124991416931, "learning_rate": 0.0002, "epoch": 2.567507746790615, "step": 2900}, {"loss": 1.6471, "grad_norm": 0.4894537925720215, "learning_rate": 0.0002, "epoch": 2.5763612217795484, "step": 2910}, {"loss": 1.6123, "grad_norm": 0.4539397358894348, "learning_rate": 0.0002, "epoch": 2.5852146967684817, "step": 2920}, {"loss": 1.6449, "grad_norm": 0.4718773066997528, "learning_rate": 0.0002, "epoch": 2.5940681717574146, "step": 2930}, {"loss": 1.584, "grad_norm": 0.49989837408065796, "learning_rate": 0.0002, "epoch": 2.602921646746348, "step": 2940}, {"loss": 1.6087, "grad_norm": 0.4862406849861145, "learning_rate": 0.0002, "epoch": 2.611775121735281, "step": 2950}, {"loss": 1.6057, "grad_norm": 0.4244804382324219, "learning_rate": 0.0002, "epoch": 2.620628596724214, "step": 2960}, {"loss": 1.7795, "grad_norm": 0.49304354190826416, "learning_rate": 0.0002, "epoch": 2.6294820717131473, "step": 2970}, {"loss": 1.7255, "grad_norm": 0.4818236529827118, "learning_rate": 0.0002, "epoch": 2.6383355467020806, "step": 2980}, {"loss": 1.621, "grad_norm": 0.5077425837516785, "learning_rate": 0.0002, "epoch": 2.647189021691014, "step": 2990}, {"loss": 1.7064, "grad_norm": 0.4494157135486603, "learning_rate": 0.0002, "epoch": 2.6560424966799467, "step": 3000}, {"loss": 1.6792, "grad_norm": 0.4790278971195221, "learning_rate": 0.0002, "epoch": 2.66489597166888, "step": 3010}, {"loss": 1.6082, "grad_norm": 0.4702624976634979, "learning_rate": 0.0002, "epoch": 2.6737494466578133, "step": 3020}, {"loss": 1.6494, "grad_norm": 0.5082133412361145, "learning_rate": 0.0002, "epoch": 2.682602921646746, "step": 3030}, {"loss": 1.6438, "grad_norm": 0.4553256630897522, "learning_rate": 0.0002, "epoch": 2.6914563966356795, "step": 3040}, {"loss": 1.6155, "grad_norm": 0.4492715001106262, "learning_rate": 0.0002, "epoch": 2.700309871624613, "step": 3050}, {"loss": 1.5367, "grad_norm": 0.4555944502353668, "learning_rate": 0.0002, "epoch": 2.709163346613546, "step": 3060}, {"loss": 1.5793, "grad_norm": 0.5879693031311035, "learning_rate": 0.0002, "epoch": 2.718016821602479, "step": 3070}, {"loss": 1.6357, "grad_norm": 0.4628562927246094, "learning_rate": 0.0002, "epoch": 2.7268702965914122, "step": 3080}, {"loss": 1.6585, "grad_norm": 0.5169575810432434, "learning_rate": 0.0002, "epoch": 2.7357237715803455, "step": 3090}, {"loss": 1.562, "grad_norm": 0.4630090892314911, "learning_rate": 0.0002, "epoch": 2.7445772465692784, "step": 3100}, {"loss": 1.5508, "grad_norm": 0.5437219738960266, "learning_rate": 0.0002, "epoch": 2.7534307215582117, "step": 3110}, {"loss": 1.6442, "grad_norm": 0.5102152228355408, "learning_rate": 0.0002, "epoch": 2.762284196547145, "step": 3120}, {"loss": 1.5448, "grad_norm": 0.48287826776504517, "learning_rate": 0.0002, "epoch": 2.771137671536078, "step": 3130}, {"loss": 1.6657, "grad_norm": 0.4671737253665924, "learning_rate": 0.0002, "epoch": 2.779991146525011, "step": 3140}, {"loss": 1.5864, "grad_norm": 0.5177035331726074, "learning_rate": 0.0002, "epoch": 2.7888446215139444, "step": 3150}, {"loss": 1.5617, "grad_norm": 0.450989305973053, "learning_rate": 0.0002, "epoch": 2.7976980965028773, "step": 3160}, {"loss": 1.597, "grad_norm": 0.45007848739624023, "learning_rate": 0.0002, "epoch": 2.8065515714918106, "step": 3170}, {"loss": 1.7179, "grad_norm": 0.4600294530391693, "learning_rate": 0.0002, "epoch": 2.815405046480744, "step": 3180}, {"loss": 1.6441, "grad_norm": 0.485628604888916, "learning_rate": 0.0002, "epoch": 2.8242585214696767, "step": 3190}, {"loss": 1.6396, "grad_norm": 0.49811574816703796, "learning_rate": 0.0002, "epoch": 2.83311199645861, "step": 3200}, {"loss": 1.6067, "grad_norm": 0.5012516975402832, "learning_rate": 0.0002, "epoch": 2.8419654714475433, "step": 3210}, {"loss": 1.6188, "grad_norm": 0.4552757740020752, "learning_rate": 0.0002, "epoch": 2.850818946436476, "step": 3220}, {"loss": 1.5993, "grad_norm": 0.4539635479450226, "learning_rate": 0.0002, "epoch": 2.8596724214254094, "step": 3230}, {"loss": 1.5957, "grad_norm": 0.5534685850143433, "learning_rate": 0.0002, "epoch": 2.8685258964143427, "step": 3240}, {"loss": 1.6065, "grad_norm": 0.4570811688899994, "learning_rate": 0.0002, "epoch": 2.8773793714032756, "step": 3250}, {"loss": 1.6016, "grad_norm": 0.48181653022766113, "learning_rate": 0.0002, "epoch": 2.886232846392209, "step": 3260}, {"loss": 1.6574, "grad_norm": 0.4871032238006592, "learning_rate": 0.0002, "epoch": 2.895086321381142, "step": 3270}, {"loss": 1.5626, "grad_norm": 0.4643239676952362, "learning_rate": 0.0002, "epoch": 2.903939796370075, "step": 3280}, {"loss": 1.5981, "grad_norm": 0.5024484395980835, "learning_rate": 0.0002, "epoch": 2.9127932713590083, "step": 3290}, {"loss": 1.5756, "grad_norm": 0.4425384998321533, "learning_rate": 0.0002, "epoch": 2.9216467463479416, "step": 3300}, {"loss": 1.644, "grad_norm": 0.459168016910553, "learning_rate": 0.0002, "epoch": 2.9305002213368745, "step": 3310}, {"loss": 1.6404, "grad_norm": 0.4950717091560364, "learning_rate": 0.0002, "epoch": 2.939353696325808, "step": 3320}, {"loss": 1.652, "grad_norm": 0.4516230523586273, "learning_rate": 0.0002, "epoch": 2.948207171314741, "step": 3330}, {"loss": 1.5917, "grad_norm": 0.49523285031318665, "learning_rate": 0.0002, "epoch": 2.957060646303674, "step": 3340}, {"loss": 1.733, "grad_norm": 0.49282631278038025, "learning_rate": 0.0002, "epoch": 2.9659141212926072, "step": 3350}, {"loss": 1.6519, "grad_norm": 0.45825016498565674, "learning_rate": 0.0002, "epoch": 2.9747675962815405, "step": 3360}, {"loss": 1.6607, "grad_norm": 0.4952891170978546, "learning_rate": 0.0002, "epoch": 2.983621071270474, "step": 3370}, {"loss": 1.5981, "grad_norm": 0.42182639241218567, "learning_rate": 0.0002, "epoch": 2.9924745462594067, "step": 3380}, {"eval_loss": 1.8308420181274414, "eval_runtime": 82.786, "eval_samples_per_second": 6.221, "eval_steps_per_second": 0.785, "epoch": 2.9995573262505535, "step": 3388}, {"loss": 1.5811, "grad_norm": 0.47721418738365173, "learning_rate": 0.0002, "epoch": 3.00132802124834, "step": 3390}, {"loss": 1.5137, "grad_norm": 0.5284923911094666, "learning_rate": 0.0002, "epoch": 3.0101814962372733, "step": 3400}, {"loss": 1.437, "grad_norm": 0.5607061982154846, "learning_rate": 0.0002, "epoch": 3.019034971226206, "step": 3410}, {"loss": 1.4909, "grad_norm": 0.5271363258361816, "learning_rate": 0.0002, "epoch": 3.0278884462151394, "step": 3420}, {"loss": 1.5645, "grad_norm": 0.48660898208618164, "learning_rate": 0.0002, "epoch": 3.0367419212040727, "step": 3430}, {"loss": 1.4754, "grad_norm": 0.5767933130264282, "learning_rate": 0.0002, "epoch": 3.0455953961930056, "step": 3440}, {"loss": 1.4647, "grad_norm": 0.5591282248497009, "learning_rate": 0.0002, "epoch": 3.054448871181939, "step": 3450}, {"loss": 1.5112, "grad_norm": 0.5870814323425293, "learning_rate": 0.0002, "epoch": 3.063302346170872, "step": 3460}, {"loss": 1.4682, "grad_norm": 0.4861546456813812, "learning_rate": 0.0002, "epoch": 3.072155821159805, "step": 3470}, {"loss": 1.4883, "grad_norm": 0.5238925814628601, "learning_rate": 0.0002, "epoch": 3.0810092961487383, "step": 3480}, {"loss": 1.4855, "grad_norm": 0.5521751046180725, "learning_rate": 0.0002, "epoch": 3.0898627711376716, "step": 3490}, {"loss": 1.4454, "grad_norm": 0.5816575884819031, "learning_rate": 0.0002, "epoch": 3.098716246126605, "step": 3500}, {"loss": 1.5113, "grad_norm": 0.5281513333320618, "learning_rate": 0.0002, "epoch": 3.1075697211155378, "step": 3510}, {"loss": 1.4723, "grad_norm": 0.5847303867340088, "learning_rate": 0.0002, "epoch": 3.116423196104471, "step": 3520}, {"loss": 1.5513, "grad_norm": 0.5683517456054688, "learning_rate": 0.0002, "epoch": 3.1252766710934043, "step": 3530}, {"loss": 1.532, "grad_norm": 0.5177015662193298, "learning_rate": 0.0002, "epoch": 3.134130146082337, "step": 3540}, {"loss": 1.4921, "grad_norm": 0.5922423601150513, "learning_rate": 0.0002, "epoch": 3.1429836210712705, "step": 3550}, {"loss": 1.5329, "grad_norm": 0.7018587589263916, "learning_rate": 0.0002, "epoch": 3.151837096060204, "step": 3560}, {"loss": 1.4677, "grad_norm": 0.6152004599571228, "learning_rate": 0.0002, "epoch": 3.1606905710491366, "step": 3570}, {"loss": 1.4288, "grad_norm": 0.5350717902183533, "learning_rate": 0.0002, "epoch": 3.16954404603807, "step": 3580}, {"loss": 1.4739, "grad_norm": 0.5971009731292725, "learning_rate": 0.0002, "epoch": 3.1783975210270032, "step": 3590}, {"loss": 1.541, "grad_norm": 0.7312001585960388, "learning_rate": 0.0002, "epoch": 3.187250996015936, "step": 3600}, {"loss": 1.5803, "grad_norm": 0.6372535228729248, "learning_rate": 0.0002, "epoch": 3.1961044710048694, "step": 3610}, {"loss": 1.4642, "grad_norm": 0.6098020672798157, "learning_rate": 0.0002, "epoch": 3.2049579459938027, "step": 3620}, {"loss": 1.5149, "grad_norm": 0.5506435632705688, "learning_rate": 0.0002, "epoch": 3.2138114209827355, "step": 3630}, {"loss": 1.4338, "grad_norm": 0.6043022274971008, "learning_rate": 0.0002, "epoch": 3.222664895971669, "step": 3640}, {"loss": 1.5351, "grad_norm": 0.5495519042015076, "learning_rate": 0.0002, "epoch": 3.231518370960602, "step": 3650}, {"loss": 1.3879, "grad_norm": 0.5769572257995605, "learning_rate": 0.0002, "epoch": 3.240371845949535, "step": 3660}, {"loss": 1.4604, "grad_norm": 0.6833786964416504, "learning_rate": 0.0002, "epoch": 3.2492253209384683, "step": 3670}, {"loss": 1.5091, "grad_norm": 0.6962856650352478, "learning_rate": 0.0002, "epoch": 3.2580787959274016, "step": 3680}, {"loss": 1.5212, "grad_norm": 0.6553098559379578, "learning_rate": 0.0002, "epoch": 3.2669322709163344, "step": 3690}, {"loss": 1.5416, "grad_norm": 0.5907557010650635, "learning_rate": 0.0002, "epoch": 3.2757857459052677, "step": 3700}, {"loss": 1.5012, "grad_norm": 0.5712862014770508, "learning_rate": 0.0002, "epoch": 3.284639220894201, "step": 3710}, {"loss": 1.5073, "grad_norm": 0.573820948600769, "learning_rate": 0.0002, "epoch": 3.2934926958831343, "step": 3720}, {"loss": 1.544, "grad_norm": 0.6650304198265076, "learning_rate": 0.0002, "epoch": 3.302346170872067, "step": 3730}, {"loss": 1.5069, "grad_norm": 0.5182583928108215, "learning_rate": 0.0002, "epoch": 3.3111996458610005, "step": 3740}, {"loss": 1.5254, "grad_norm": 0.5078902840614319, "learning_rate": 0.0002, "epoch": 3.3200531208499338, "step": 3750}, {"loss": 1.4881, "grad_norm": 0.7062374353408813, "learning_rate": 0.0002, "epoch": 3.3289065958388666, "step": 3760}, {"loss": 1.5017, "grad_norm": 0.5711262822151184, "learning_rate": 0.0002, "epoch": 3.3377600708278, "step": 3770}, {"loss": 1.4982, "grad_norm": 0.5624606013298035, "learning_rate": 0.0002, "epoch": 3.346613545816733, "step": 3780}, {"loss": 1.4515, "grad_norm": 0.6008231043815613, "learning_rate": 0.0002, "epoch": 3.355467020805666, "step": 3790}, {"loss": 1.5038, "grad_norm": 0.6120018362998962, "learning_rate": 0.0002, "epoch": 3.3643204957945994, "step": 3800}, {"loss": 1.4918, "grad_norm": 0.5679979920387268, "learning_rate": 0.0002, "epoch": 3.3731739707835326, "step": 3810}, {"loss": 1.5435, "grad_norm": 0.5613794922828674, "learning_rate": 0.0002, "epoch": 3.3820274457724655, "step": 3820}, {"loss": 1.5319, "grad_norm": 0.5328839421272278, "learning_rate": 0.0002, "epoch": 3.390880920761399, "step": 3830}, {"loss": 1.5262, "grad_norm": 0.5960017442703247, "learning_rate": 0.0002, "epoch": 3.399734395750332, "step": 3840}, {"loss": 1.4227, "grad_norm": 0.5264106392860413, "learning_rate": 0.0002, "epoch": 3.4085878707392654, "step": 3850}, {"loss": 1.4766, "grad_norm": 0.6378359198570251, "learning_rate": 0.0002, "epoch": 3.4174413457281982, "step": 3860}, {"loss": 1.4898, "grad_norm": 0.5792967677116394, "learning_rate": 0.0002, "epoch": 3.4262948207171315, "step": 3870}, {"loss": 1.4914, "grad_norm": 0.6836280822753906, "learning_rate": 0.0002, "epoch": 3.435148295706065, "step": 3880}, {"loss": 1.5002, "grad_norm": 0.6073971390724182, "learning_rate": 0.0002, "epoch": 3.4440017706949977, "step": 3890}, {"loss": 1.4473, "grad_norm": 0.5753195881843567, "learning_rate": 0.0002, "epoch": 3.452855245683931, "step": 3900}, {"loss": 1.5332, "grad_norm": 0.6007646918296814, "learning_rate": 0.0002, "epoch": 3.4617087206728643, "step": 3910}, {"loss": 1.515, "grad_norm": 0.6025636196136475, "learning_rate": 0.0002, "epoch": 3.470562195661797, "step": 3920}, {"loss": 1.4612, "grad_norm": 0.6819562315940857, "learning_rate": 0.0002, "epoch": 3.4794156706507304, "step": 3930}, {"loss": 1.518, "grad_norm": 0.6448395848274231, "learning_rate": 0.0002, "epoch": 3.4882691456396637, "step": 3940}, {"loss": 1.5194, "grad_norm": 0.5712178945541382, "learning_rate": 0.0002, "epoch": 3.4971226206285966, "step": 3950}, {"loss": 1.4757, "grad_norm": 0.6300532817840576, "learning_rate": 0.0002, "epoch": 3.50597609561753, "step": 3960}, {"loss": 1.5142, "grad_norm": 0.6120840907096863, "learning_rate": 0.0002, "epoch": 3.514829570606463, "step": 3970}, {"loss": 1.559, "grad_norm": 0.6887575387954712, "learning_rate": 0.0002, "epoch": 3.523683045595396, "step": 3980}, {"loss": 1.5591, "grad_norm": 0.6970235109329224, "learning_rate": 0.0002, "epoch": 3.5325365205843293, "step": 3990}, {"loss": 1.5198, "grad_norm": 0.5818213820457458, "learning_rate": 0.0002, "epoch": 3.5413899955732626, "step": 4000}, {"loss": 1.5367, "grad_norm": 1.0533310174942017, "learning_rate": 0.0002, "epoch": 3.5502434705621955, "step": 4010}, {"loss": 1.5399, "grad_norm": 0.5444280505180359, "learning_rate": 0.0002, "epoch": 3.5590969455511288, "step": 4020}, {"loss": 1.5573, "grad_norm": 0.6007506847381592, "learning_rate": 0.0002, "epoch": 3.567950420540062, "step": 4030}, {"loss": 1.5059, "grad_norm": 0.6088743805885315, "learning_rate": 0.0002, "epoch": 3.576803895528995, "step": 4040}, {"loss": 1.5174, "grad_norm": 0.5934239029884338, "learning_rate": 0.0002, "epoch": 3.585657370517928, "step": 4050}, {"loss": 1.4938, "grad_norm": 0.605251669883728, "learning_rate": 0.0002, "epoch": 3.5945108455068615, "step": 4060}, {"loss": 1.5142, "grad_norm": 0.5903469920158386, "learning_rate": 0.0002, "epoch": 3.6033643204957944, "step": 4070}, {"loss": 1.5234, "grad_norm": 0.6752413511276245, "learning_rate": 0.0002, "epoch": 3.6122177954847277, "step": 4080}, {"loss": 1.5041, "grad_norm": 0.5810418725013733, "learning_rate": 0.0002, "epoch": 3.621071270473661, "step": 4090}, {"loss": 1.5358, "grad_norm": 0.5918573141098022, "learning_rate": 0.0002, "epoch": 3.629924745462594, "step": 4100}, {"loss": 1.499, "grad_norm": 0.6635358333587646, "learning_rate": 0.0002, "epoch": 3.638778220451527, "step": 4110}, {"loss": 1.5021, "grad_norm": 0.5785038471221924, "learning_rate": 0.0002, "epoch": 3.6476316954404604, "step": 4120}, {"loss": 1.5711, "grad_norm": 0.5837879776954651, "learning_rate": 0.0002, "epoch": 3.6564851704293937, "step": 4130}, {"loss": 1.4273, "grad_norm": 0.6449324488639832, "learning_rate": 0.0002, "epoch": 3.6653386454183265, "step": 4140}, {"loss": 1.4608, "grad_norm": 0.6191908717155457, "learning_rate": 0.0002, "epoch": 3.67419212040726, "step": 4150}, {"loss": 1.4567, "grad_norm": 0.6937987208366394, "learning_rate": 0.0002, "epoch": 3.683045595396193, "step": 4160}, {"loss": 1.4136, "grad_norm": 0.581128716468811, "learning_rate": 0.0002, "epoch": 3.6918990703851264, "step": 4170}, {"loss": 1.4204, "grad_norm": 0.6547803282737732, "learning_rate": 0.0002, "epoch": 3.7007525453740593, "step": 4180}, {"loss": 1.4653, "grad_norm": 0.5961150527000427, "learning_rate": 0.0002, "epoch": 3.7096060203629926, "step": 4190}, {"loss": 1.4755, "grad_norm": 0.6197913885116577, "learning_rate": 0.0002, "epoch": 3.718459495351926, "step": 4200}, {"loss": 1.5191, "grad_norm": 0.688565194606781, "learning_rate": 0.0002, "epoch": 3.7273129703408587, "step": 4210}, {"loss": 1.5618, "grad_norm": 0.5832270979881287, "learning_rate": 0.0002, "epoch": 3.736166445329792, "step": 4220}, {"loss": 1.4747, "grad_norm": 0.5643884539604187, "learning_rate": 0.0002, "epoch": 3.7450199203187253, "step": 4230}, {"loss": 1.5242, "grad_norm": 0.6236484050750732, "learning_rate": 0.0002, "epoch": 3.753873395307658, "step": 4240}, {"loss": 1.576, "grad_norm": 0.5367720127105713, "learning_rate": 0.0002, "epoch": 3.7627268702965915, "step": 4250}, {"loss": 1.5234, "grad_norm": 0.5785109400749207, "learning_rate": 0.0002, "epoch": 3.7715803452855248, "step": 4260}, {"loss": 1.4947, "grad_norm": 0.5698465704917908, "learning_rate": 0.0002, "epoch": 3.7804338202744576, "step": 4270}, {"loss": 1.4769, "grad_norm": 0.5748036503791809, "learning_rate": 0.0002, "epoch": 3.789287295263391, "step": 4280}, {"loss": 1.5503, "grad_norm": 0.608147382736206, "learning_rate": 0.0002, "epoch": 3.798140770252324, "step": 4290}, {"loss": 1.5354, "grad_norm": 0.5820456147193909, "learning_rate": 0.0002, "epoch": 3.806994245241257, "step": 4300}, {"loss": 1.5668, "grad_norm": 0.6325612664222717, "learning_rate": 0.0002, "epoch": 3.8158477202301904, "step": 4310}, {"loss": 1.5295, "grad_norm": 0.6465362310409546, "learning_rate": 0.0002, "epoch": 3.8247011952191237, "step": 4320}, {"loss": 1.5048, "grad_norm": 0.5630854368209839, "learning_rate": 0.0002, "epoch": 3.8335546702080565, "step": 4330}, {"loss": 1.5636, "grad_norm": 0.6181462407112122, "learning_rate": 0.0002, "epoch": 3.84240814519699, "step": 4340}, {"loss": 1.5113, "grad_norm": 0.6207571029663086, "learning_rate": 0.0002, "epoch": 3.851261620185923, "step": 4350}, {"loss": 1.5424, "grad_norm": 0.6092919111251831, "learning_rate": 0.0002, "epoch": 3.860115095174856, "step": 4360}, {"loss": 1.5214, "grad_norm": 0.6140493750572205, "learning_rate": 0.0002, "epoch": 3.8689685701637893, "step": 4370}, {"loss": 1.5574, "grad_norm": 0.611575722694397, "learning_rate": 0.0002, "epoch": 3.8778220451527226, "step": 4380}, {"loss": 1.5563, "grad_norm": 0.6288794279098511, "learning_rate": 0.0002, "epoch": 3.8866755201416554, "step": 4390}, {"loss": 1.4967, "grad_norm": 0.6518979072570801, "learning_rate": 0.0002, "epoch": 3.8955289951305887, "step": 4400}, {"loss": 1.5366, "grad_norm": 0.6144753098487854, "learning_rate": 0.0002, "epoch": 3.904382470119522, "step": 4410}, {"loss": 1.6285, "grad_norm": 0.7034937143325806, "learning_rate": 0.0002, "epoch": 3.913235945108455, "step": 4420}, {"loss": 1.4978, "grad_norm": 0.5713187456130981, "learning_rate": 0.0002, "epoch": 3.922089420097388, "step": 4430}, {"loss": 1.5532, "grad_norm": 0.6187576651573181, "learning_rate": 0.0002, "epoch": 3.9309428950863214, "step": 4440}, {"loss": 1.551, "grad_norm": 0.6439383029937744, "learning_rate": 0.0002, "epoch": 3.9397963700752543, "step": 4450}, {"loss": 1.5073, "grad_norm": 0.6133334636688232, "learning_rate": 0.0002, "epoch": 3.9486498450641876, "step": 4460}, {"loss": 1.538, "grad_norm": 0.593463659286499, "learning_rate": 0.0002, "epoch": 3.957503320053121, "step": 4470}, {"loss": 1.5636, "grad_norm": 0.6261998414993286, "learning_rate": 0.0002, "epoch": 3.9663567950420537, "step": 4480}, {"loss": 1.4888, "grad_norm": 0.6153767704963684, "learning_rate": 0.0002, "epoch": 3.975210270030987, "step": 4490}, {"loss": 1.4986, "grad_norm": 0.6184002757072449, "learning_rate": 0.0002, "epoch": 3.9840637450199203, "step": 4500}, {"loss": 1.5134, "grad_norm": 0.5212734341621399, "learning_rate": 0.0002, "epoch": 3.9929172200088536, "step": 4510}, {"eval_loss": 1.8745536804199219, "eval_runtime": 83.0125, "eval_samples_per_second": 6.204, "eval_steps_per_second": 0.783, "epoch": 4.0, "step": 4518}, {"loss": 1.4708, "grad_norm": 0.5871603488922119, "learning_rate": 0.0002, "epoch": 4.001770694997787, "step": 4520}, {"loss": 1.4139, "grad_norm": 0.6746091842651367, "learning_rate": 0.0002, "epoch": 4.01062416998672, "step": 4530}, {"loss": 1.3625, "grad_norm": 0.6159639358520508, "learning_rate": 0.0002, "epoch": 4.019477644975653, "step": 4540}, {"loss": 1.3766, "grad_norm": 0.7529398202896118, "learning_rate": 0.0002, "epoch": 4.028331119964586, "step": 4550}, {"loss": 1.3202, "grad_norm": 0.788398027420044, "learning_rate": 0.0002, "epoch": 4.037184594953519, "step": 4560}, {"loss": 1.4254, "grad_norm": 0.9679850935935974, "learning_rate": 0.0002, "epoch": 4.046038069942452, "step": 4570}, {"loss": 1.2911, "grad_norm": 0.6305310130119324, "learning_rate": 0.0002, "epoch": 4.054891544931386, "step": 4580}, {"loss": 1.3525, "grad_norm": 0.8557451963424683, "learning_rate": 0.0002, "epoch": 4.063745019920319, "step": 4590}, {"loss": 1.3901, "grad_norm": 0.741518497467041, "learning_rate": 0.0002, "epoch": 4.0725984949092515, "step": 4600}, {"loss": 1.3374, "grad_norm": 0.6573862433433533, "learning_rate": 0.0002, "epoch": 4.081451969898185, "step": 4610}, {"loss": 1.3341, "grad_norm": 0.6926319599151611, "learning_rate": 0.0002, "epoch": 4.090305444887118, "step": 4620}, {"loss": 1.4176, "grad_norm": 0.9212626218795776, "learning_rate": 0.0002, "epoch": 4.099158919876051, "step": 4630}, {"loss": 1.3402, "grad_norm": 0.7167867422103882, "learning_rate": 0.0002, "epoch": 4.108012394864985, "step": 4640}, {"loss": 1.3333, "grad_norm": 0.6691595911979675, "learning_rate": 0.0002, "epoch": 4.116865869853918, "step": 4650}, {"loss": 1.247, "grad_norm": 0.8708247542381287, "learning_rate": 0.0002, "epoch": 4.12571934484285, "step": 4660}, {"loss": 1.3599, "grad_norm": 0.8612170219421387, "learning_rate": 0.0002, "epoch": 4.134572819831784, "step": 4670}, {"loss": 1.3418, "grad_norm": 0.7688325047492981, "learning_rate": 0.0002, "epoch": 4.143426294820717, "step": 4680}, {"loss": 1.4349, "grad_norm": 0.7606917023658752, "learning_rate": 0.0002, "epoch": 4.152279769809651, "step": 4690}, {"loss": 1.3521, "grad_norm": 0.8241282105445862, "learning_rate": 0.0002, "epoch": 4.161133244798584, "step": 4700}, {"loss": 1.3325, "grad_norm": 0.7480464577674866, "learning_rate": 0.0002, "epoch": 4.1699867197875164, "step": 4710}, {"loss": 1.4027, "grad_norm": 0.7092460989952087, "learning_rate": 0.0002, "epoch": 4.17884019477645, "step": 4720}, {"loss": 1.4005, "grad_norm": 0.8782108426094055, "learning_rate": 0.0002, "epoch": 4.187693669765383, "step": 4730}, {"loss": 1.3626, "grad_norm": 0.6875300407409668, "learning_rate": 0.0002, "epoch": 4.196547144754316, "step": 4740}, {"loss": 1.3798, "grad_norm": 0.7713887691497803, "learning_rate": 0.0002, "epoch": 4.20540061974325, "step": 4750}, {"loss": 1.3822, "grad_norm": 0.8270819783210754, "learning_rate": 0.0002, "epoch": 4.2142540947321825, "step": 4760}, {"loss": 1.3559, "grad_norm": 0.7109288573265076, "learning_rate": 0.0002, "epoch": 4.223107569721115, "step": 4770}, {"loss": 1.3948, "grad_norm": 0.7209359407424927, "learning_rate": 0.0002, "epoch": 4.231961044710049, "step": 4780}, {"loss": 1.3691, "grad_norm": 0.7142833471298218, "learning_rate": 0.0002, "epoch": 4.240814519698982, "step": 4790}, {"loss": 1.3654, "grad_norm": 0.8526809811592102, "learning_rate": 0.0002, "epoch": 4.249667994687915, "step": 4800}, {"loss": 1.3819, "grad_norm": 0.7064695954322815, "learning_rate": 0.0002, "epoch": 4.2585214696768485, "step": 4810}, {"loss": 1.3333, "grad_norm": 0.7646124362945557, "learning_rate": 0.0002, "epoch": 4.267374944665781, "step": 4820}, {"loss": 1.4247, "grad_norm": 0.7377115488052368, "learning_rate": 0.0002, "epoch": 4.276228419654714, "step": 4830}, {"loss": 1.3683, "grad_norm": 0.7308453321456909, "learning_rate": 0.0002, "epoch": 4.285081894643648, "step": 4840}, {"loss": 1.3653, "grad_norm": 0.6687684059143066, "learning_rate": 0.0002, "epoch": 4.293935369632581, "step": 4850}, {"loss": 1.3538, "grad_norm": 0.7447634339332581, "learning_rate": 0.0002, "epoch": 4.302788844621514, "step": 4860}, {"loss": 1.3842, "grad_norm": 0.7661601305007935, "learning_rate": 0.0002, "epoch": 4.311642319610447, "step": 4870}, {"loss": 1.3783, "grad_norm": 0.7492215037345886, "learning_rate": 0.0002, "epoch": 4.32049579459938, "step": 4880}, {"loss": 1.4089, "grad_norm": 0.9554458856582642, "learning_rate": 0.0002, "epoch": 4.329349269588313, "step": 4890}, {"loss": 1.3582, "grad_norm": 0.7409822940826416, "learning_rate": 0.0002, "epoch": 4.338202744577247, "step": 4900}, {"loss": 1.2581, "grad_norm": 0.9848645329475403, "learning_rate": 0.0002, "epoch": 4.34705621956618, "step": 4910}, {"loss": 1.3809, "grad_norm": 0.803995668888092, "learning_rate": 0.0002, "epoch": 4.355909694555113, "step": 4920}, {"loss": 1.3585, "grad_norm": 0.7480606436729431, "learning_rate": 0.0002, "epoch": 4.364763169544046, "step": 4930}, {"loss": 1.4092, "grad_norm": 0.7018141150474548, "learning_rate": 0.0002, "epoch": 4.373616644532979, "step": 4940}, {"loss": 1.4034, "grad_norm": 0.7684932351112366, "learning_rate": 0.0002, "epoch": 4.382470119521912, "step": 4950}, {"loss": 1.3937, "grad_norm": 0.7849185466766357, "learning_rate": 0.0002, "epoch": 4.391323594510846, "step": 4960}, {"loss": 1.3763, "grad_norm": 0.7858862280845642, "learning_rate": 0.0002, "epoch": 4.400177069499779, "step": 4970}, {"loss": 1.3901, "grad_norm": 0.8270778059959412, "learning_rate": 0.0002, "epoch": 4.4090305444887115, "step": 4980}, {"loss": 1.445, "grad_norm": 0.8464101552963257, "learning_rate": 0.0002, "epoch": 4.417884019477645, "step": 4990}, {"loss": 1.3586, "grad_norm": 0.85670405626297, "learning_rate": 0.0002, "epoch": 4.426737494466578, "step": 5000}, {"loss": 1.4203, "grad_norm": 0.8656655550003052, "learning_rate": 0.0002, "epoch": 4.435590969455511, "step": 5010}, {"loss": 1.3426, "grad_norm": 0.7605292201042175, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 5020}, {"loss": 1.3803, "grad_norm": 0.7682471871376038, "learning_rate": 0.0002, "epoch": 4.4532979194333775, "step": 5030}, {"loss": 1.3432, "grad_norm": 0.7209102511405945, "learning_rate": 0.0002, "epoch": 4.46215139442231, "step": 5040}, {"loss": 1.5126, "grad_norm": 0.8259989023208618, "learning_rate": 0.0002, "epoch": 4.471004869411244, "step": 5050}, {"loss": 1.3709, "grad_norm": 0.7342197895050049, "learning_rate": 0.0002, "epoch": 4.479858344400177, "step": 5060}, {"loss": 1.4196, "grad_norm": 0.7869040369987488, "learning_rate": 0.0002, "epoch": 4.48871181938911, "step": 5070}, {"loss": 1.3734, "grad_norm": 0.7906143665313721, "learning_rate": 0.0002, "epoch": 4.4975652943780435, "step": 5080}, {"loss": 1.3555, "grad_norm": 0.7336861491203308, "learning_rate": 0.0002, "epoch": 4.506418769366976, "step": 5090}, {"loss": 1.3768, "grad_norm": 0.8264166712760925, "learning_rate": 0.0002, "epoch": 4.515272244355909, "step": 5100}, {"loss": 1.3822, "grad_norm": 0.8144693970680237, "learning_rate": 0.0002, "epoch": 4.524125719344843, "step": 5110}, {"loss": 1.3044, "grad_norm": 0.8257269263267517, "learning_rate": 0.0002, "epoch": 4.532979194333776, "step": 5120}, {"loss": 1.3501, "grad_norm": 0.8838174343109131, "learning_rate": 0.0002, "epoch": 4.541832669322709, "step": 5130}, {"loss": 1.3464, "grad_norm": 0.7081145644187927, "learning_rate": 0.0002, "epoch": 4.550686144311642, "step": 5140}, {"loss": 1.342, "grad_norm": 0.7137823700904846, "learning_rate": 0.0002, "epoch": 4.559539619300575, "step": 5150}, {"loss": 1.3788, "grad_norm": 0.7890386581420898, "learning_rate": 0.0002, "epoch": 4.568393094289509, "step": 5160}, {"loss": 1.3368, "grad_norm": 0.6418015360832214, "learning_rate": 0.0002, "epoch": 4.577246569278442, "step": 5170}, {"loss": 1.3892, "grad_norm": 0.768373966217041, "learning_rate": 0.0002, "epoch": 4.586100044267375, "step": 5180}, {"loss": 1.3953, "grad_norm": 0.6934067606925964, "learning_rate": 0.0002, "epoch": 4.5949535192563085, "step": 5190}, {"loss": 1.3782, "grad_norm": 0.9430719017982483, "learning_rate": 0.0002, "epoch": 4.603806994245241, "step": 5200}, {"loss": 1.3981, "grad_norm": 0.880264163017273, "learning_rate": 0.0002, "epoch": 4.612660469234174, "step": 5210}, {"loss": 1.3506, "grad_norm": 0.7584623098373413, "learning_rate": 0.0002, "epoch": 4.621513944223108, "step": 5220}, {"loss": 1.3973, "grad_norm": 0.7974506616592407, "learning_rate": 0.0002, "epoch": 4.630367419212041, "step": 5230}, {"loss": 1.3818, "grad_norm": 0.8812133073806763, "learning_rate": 0.0002, "epoch": 4.639220894200974, "step": 5240}, {"loss": 1.4002, "grad_norm": 0.8968724012374878, "learning_rate": 0.0002, "epoch": 4.648074369189907, "step": 5250}, {"loss": 1.3327, "grad_norm": 0.7317764759063721, "learning_rate": 0.0002, "epoch": 4.65692784417884, "step": 5260}, {"loss": 1.4363, "grad_norm": 0.7415484189987183, "learning_rate": 0.0002, "epoch": 4.665781319167773, "step": 5270}, {"loss": 1.3673, "grad_norm": 0.7867009043693542, "learning_rate": 0.0002, "epoch": 4.674634794156707, "step": 5280}, {"loss": 1.4246, "grad_norm": 0.6895416378974915, "learning_rate": 0.0002, "epoch": 4.68348826914564, "step": 5290}, {"loss": 1.3438, "grad_norm": 0.7324506640434265, "learning_rate": 0.0002, "epoch": 4.6923417441345725, "step": 5300}, {"loss": 1.4072, "grad_norm": 0.7383193969726562, "learning_rate": 0.0002, "epoch": 4.701195219123506, "step": 5310}, {"loss": 1.3269, "grad_norm": 0.8254916071891785, "learning_rate": 0.0002, "epoch": 4.710048694112439, "step": 5320}, {"loss": 1.4317, "grad_norm": 0.8161033987998962, "learning_rate": 0.0002, "epoch": 4.718902169101372, "step": 5330}, {"loss": 1.3623, "grad_norm": 0.7664386034011841, "learning_rate": 0.0002, "epoch": 4.727755644090306, "step": 5340}, {"loss": 1.4293, "grad_norm": 0.7465475797653198, "learning_rate": 0.0002, "epoch": 4.7366091190792385, "step": 5350}, {"loss": 1.3435, "grad_norm": 0.7810078263282776, "learning_rate": 0.0002, "epoch": 4.745462594068171, "step": 5360}, {"loss": 1.4489, "grad_norm": 0.7428439855575562, "learning_rate": 0.0002, "epoch": 4.754316069057105, "step": 5370}, {"loss": 1.3607, "grad_norm": 0.9548320174217224, "learning_rate": 0.0002, "epoch": 4.763169544046038, "step": 5380}, {"loss": 1.3398, "grad_norm": 0.7959533333778381, "learning_rate": 0.0002, "epoch": 4.772023019034972, "step": 5390}, {"loss": 1.3448, "grad_norm": 0.747473418712616, "learning_rate": 0.0002, "epoch": 4.780876494023905, "step": 5400}, {"loss": 1.3954, "grad_norm": 0.7863122820854187, "learning_rate": 0.0002, "epoch": 4.789729969012837, "step": 5410}, {"loss": 1.4166, "grad_norm": 0.7769626379013062, "learning_rate": 0.0002, "epoch": 4.798583444001771, "step": 5420}, {"loss": 1.4484, "grad_norm": 0.8551191091537476, "learning_rate": 0.0002, "epoch": 4.807436918990704, "step": 5430}, {"loss": 1.4314, "grad_norm": 0.8364850878715515, "learning_rate": 0.0002, "epoch": 4.816290393979637, "step": 5440}, {"loss": 1.4028, "grad_norm": 0.7458856701850891, "learning_rate": 0.0002, "epoch": 4.825143868968571, "step": 5450}, {"loss": 1.3923, "grad_norm": 0.7558291554450989, "learning_rate": 0.0002, "epoch": 4.8339973439575035, "step": 5460}, {"loss": 1.3343, "grad_norm": 0.8396534323692322, "learning_rate": 0.0002, "epoch": 4.842850818946436, "step": 5470}, {"loss": 1.3853, "grad_norm": 0.7790794968605042, "learning_rate": 0.0002, "epoch": 4.85170429393537, "step": 5480}, {"loss": 1.406, "grad_norm": 0.8607641458511353, "learning_rate": 0.0002, "epoch": 4.860557768924303, "step": 5490}, {"loss": 1.4011, "grad_norm": 0.828134298324585, "learning_rate": 0.0002, "epoch": 4.869411243913236, "step": 5500}, {"loss": 1.4089, "grad_norm": 0.8783106803894043, "learning_rate": 0.0002, "epoch": 4.8782647189021695, "step": 5510}, {"loss": 1.4565, "grad_norm": 0.7476183176040649, "learning_rate": 0.0002, "epoch": 4.887118193891102, "step": 5520}, {"loss": 1.3974, "grad_norm": 0.8023254871368408, "learning_rate": 0.0002, "epoch": 4.895971668880035, "step": 5530}, {"loss": 1.2979, "grad_norm": 0.8021706938743591, "learning_rate": 0.0002, "epoch": 4.904825143868969, "step": 5540}, {"loss": 1.4139, "grad_norm": 0.7873618602752686, "learning_rate": 0.0002, "epoch": 4.913678618857902, "step": 5550}, {"loss": 1.4393, "grad_norm": 0.7181428670883179, "learning_rate": 0.0002, "epoch": 4.922532093846835, "step": 5560}, {"loss": 1.3968, "grad_norm": 0.7464273571968079, "learning_rate": 0.0002, "epoch": 4.931385568835768, "step": 5570}, {"loss": 1.3184, "grad_norm": 0.7433671355247498, "learning_rate": 0.0002, "epoch": 4.940239043824701, "step": 5580}, {"loss": 1.4174, "grad_norm": 0.7571114301681519, "learning_rate": 0.0002, "epoch": 4.949092518813634, "step": 5590}, {"loss": 1.4418, "grad_norm": 0.7811630964279175, "learning_rate": 0.0002, "epoch": 4.957945993802568, "step": 5600}, {"loss": 1.4288, "grad_norm": 0.7609148621559143, "learning_rate": 0.0002, "epoch": 4.966799468791501, "step": 5610}, {"loss": 1.3786, "grad_norm": 0.7324382066726685, "learning_rate": 0.0002, "epoch": 4.9756529437804335, "step": 5620}, {"loss": 1.4557, "grad_norm": 0.9249559640884399, "learning_rate": 0.0002, "epoch": 4.984506418769367, "step": 5630}, {"loss": 1.4064, "grad_norm": 0.7852522134780884, "learning_rate": 0.0002, "epoch": 4.9933598937583, "step": 5640}, {"eval_loss": 1.9384633302688599, "eval_runtime": 82.6042, "eval_samples_per_second": 6.235, "eval_steps_per_second": 0.787, "epoch": 4.999557326250553, "step": 5647}, {"loss": 1.4261, "grad_norm": 0.8052749037742615, "learning_rate": 0.0002, "epoch": 5.002213368747233, "step": 5650}, {"loss": 1.1967, "grad_norm": 1.380603551864624, "learning_rate": 0.0002, "epoch": 5.011066843736167, "step": 5660}, {"loss": 1.1871, "grad_norm": 0.9197829365730286, "learning_rate": 0.0002, "epoch": 5.0199203187251, "step": 5670}, {"loss": 1.1966, "grad_norm": 0.9338570833206177, "learning_rate": 0.0002, "epoch": 5.028773793714032, "step": 5680}, {"loss": 1.1866, "grad_norm": 1.0464060306549072, "learning_rate": 0.0002, "epoch": 5.037627268702966, "step": 5690}, {"loss": 1.2211, "grad_norm": 0.9055638909339905, "learning_rate": 0.0002, "epoch": 5.046480743691899, "step": 5700}, {"loss": 1.1987, "grad_norm": 0.9494627714157104, "learning_rate": 0.0002, "epoch": 5.055334218680832, "step": 5710}, {"loss": 1.2647, "grad_norm": 0.9680962562561035, "learning_rate": 0.0002, "epoch": 5.064187693669766, "step": 5720}, {"loss": 1.2452, "grad_norm": 1.0254695415496826, "learning_rate": 0.0002, "epoch": 5.0730411686586985, "step": 5730}, {"loss": 1.2006, "grad_norm": 0.9306758642196655, "learning_rate": 0.0002, "epoch": 5.081894643647631, "step": 5740}, {"loss": 1.2254, "grad_norm": 1.0620356798171997, "learning_rate": 0.0002, "epoch": 5.090748118636565, "step": 5750}, {"loss": 1.2628, "grad_norm": 1.0401700735092163, "learning_rate": 0.0002, "epoch": 5.099601593625498, "step": 5760}, {"loss": 1.1976, "grad_norm": 0.9916906952857971, "learning_rate": 0.0002, "epoch": 5.108455068614431, "step": 5770}, {"loss": 1.2847, "grad_norm": 0.8387252688407898, "learning_rate": 0.0002, "epoch": 5.1173085436033645, "step": 5780}, {"loss": 1.2472, "grad_norm": 0.9870850443840027, "learning_rate": 0.0002, "epoch": 5.126162018592297, "step": 5790}, {"loss": 1.1902, "grad_norm": 0.9204064011573792, "learning_rate": 0.0002, "epoch": 5.13501549358123, "step": 5800}, {"loss": 1.2266, "grad_norm": 0.9951931834220886, "learning_rate": 0.0002, "epoch": 5.143868968570164, "step": 5810}, {"loss": 1.2113, "grad_norm": 0.9745809435844421, "learning_rate": 0.0002, "epoch": 5.152722443559097, "step": 5820}, {"loss": 1.2549, "grad_norm": 0.9467785954475403, "learning_rate": 0.0002, "epoch": 5.16157591854803, "step": 5830}, {"loss": 1.2309, "grad_norm": 1.0451668500900269, "learning_rate": 0.0002, "epoch": 5.170429393536963, "step": 5840}, {"loss": 1.2215, "grad_norm": 0.9740142822265625, "learning_rate": 0.0002, "epoch": 5.179282868525896, "step": 5850}, {"loss": 1.2137, "grad_norm": 1.2158266305923462, "learning_rate": 0.0002, "epoch": 5.18813634351483, "step": 5860}, {"loss": 1.1631, "grad_norm": 1.0795036554336548, "learning_rate": 0.0002, "epoch": 5.196989818503763, "step": 5870}, {"loss": 1.1448, "grad_norm": 0.9578470587730408, "learning_rate": 0.0002, "epoch": 5.205843293492696, "step": 5880}, {"loss": 1.2183, "grad_norm": 0.8887509703636169, "learning_rate": 0.0002, "epoch": 5.214696768481629, "step": 5890}, {"loss": 1.1991, "grad_norm": 1.171006441116333, "learning_rate": 0.0002, "epoch": 5.223550243470562, "step": 5900}, {"loss": 1.1781, "grad_norm": 0.9016029834747314, "learning_rate": 0.0002, "epoch": 5.232403718459495, "step": 5910}, {"loss": 1.2057, "grad_norm": 1.173136830329895, "learning_rate": 0.0002, "epoch": 5.241257193448429, "step": 5920}, {"loss": 1.2856, "grad_norm": 0.8760318160057068, "learning_rate": 0.0002, "epoch": 5.250110668437362, "step": 5930}, {"loss": 1.2301, "grad_norm": 0.8998854160308838, "learning_rate": 0.0002, "epoch": 5.258964143426295, "step": 5940}, {"loss": 1.3058, "grad_norm": 1.017175316810608, "learning_rate": 0.0002, "epoch": 5.267817618415228, "step": 5950}, {"loss": 1.2552, "grad_norm": 0.8646609783172607, "learning_rate": 0.0002, "epoch": 5.276671093404161, "step": 5960}, {"loss": 1.2044, "grad_norm": 1.0030627250671387, "learning_rate": 0.0002, "epoch": 5.285524568393094, "step": 5970}, {"loss": 1.2365, "grad_norm": 0.975911557674408, "learning_rate": 0.0002, "epoch": 5.294378043382028, "step": 5980}, {"loss": 1.2307, "grad_norm": 0.9576130509376526, "learning_rate": 0.0002, "epoch": 5.303231518370961, "step": 5990}, {"loss": 1.2681, "grad_norm": 0.9566167593002319, "learning_rate": 0.0002, "epoch": 5.3120849933598935, "step": 6000}, {"loss": 1.2029, "grad_norm": 0.9200350642204285, "learning_rate": 0.0002, "epoch": 5.320938468348827, "step": 6010}, {"loss": 1.1871, "grad_norm": 1.0491118431091309, "learning_rate": 0.0002, "epoch": 5.32979194333776, "step": 6020}, {"loss": 1.2531, "grad_norm": 1.1199153661727905, "learning_rate": 0.0002, "epoch": 5.338645418326693, "step": 6030}, {"loss": 1.265, "grad_norm": 1.015252947807312, "learning_rate": 0.0002, "epoch": 5.347498893315627, "step": 6040}, {"loss": 1.2208, "grad_norm": 1.1076666116714478, "learning_rate": 0.0002, "epoch": 5.3563523683045595, "step": 6050}, {"loss": 1.1953, "grad_norm": 0.9224653840065002, "learning_rate": 0.0002, "epoch": 5.365205843293492, "step": 6060}, {"loss": 1.2045, "grad_norm": 1.0079779624938965, "learning_rate": 0.0002, "epoch": 5.374059318282426, "step": 6070}, {"loss": 1.2612, "grad_norm": 0.9627894759178162, "learning_rate": 0.0002, "epoch": 5.382912793271359, "step": 6080}, {"loss": 1.3116, "grad_norm": 1.0503166913986206, "learning_rate": 0.0002, "epoch": 5.391766268260292, "step": 6090}, {"loss": 1.2565, "grad_norm": 0.912736713886261, "learning_rate": 0.0002, "epoch": 5.400619743249226, "step": 6100}, {"loss": 1.204, "grad_norm": 1.2552032470703125, "learning_rate": 0.0002, "epoch": 5.409473218238158, "step": 6110}, {"loss": 1.2738, "grad_norm": 0.986230731010437, "learning_rate": 0.0002, "epoch": 5.418326693227091, "step": 6120}, {"loss": 1.3301, "grad_norm": 0.9869757294654846, "learning_rate": 0.0002, "epoch": 5.427180168216025, "step": 6130}, {"loss": 1.241, "grad_norm": 1.012027621269226, "learning_rate": 0.0002, "epoch": 5.436033643204958, "step": 6140}, {"loss": 1.224, "grad_norm": 0.8855568170547485, "learning_rate": 0.0002, "epoch": 5.444887118193891, "step": 6150}, {"loss": 1.2539, "grad_norm": 1.1522414684295654, "learning_rate": 0.0002, "epoch": 5.4537405931828244, "step": 6160}, {"loss": 1.2402, "grad_norm": 1.2448474168777466, "learning_rate": 0.0002, "epoch": 5.462594068171757, "step": 6170}, {"loss": 1.179, "grad_norm": 1.0362223386764526, "learning_rate": 0.0002, "epoch": 5.471447543160691, "step": 6180}, {"loss": 1.2351, "grad_norm": 0.9363031983375549, "learning_rate": 0.0002, "epoch": 5.480301018149624, "step": 6190}, {"loss": 1.2394, "grad_norm": 0.8852020502090454, "learning_rate": 0.0002, "epoch": 5.489154493138557, "step": 6200}, {"loss": 1.311, "grad_norm": 0.8577062487602234, "learning_rate": 0.0002, "epoch": 5.4980079681274905, "step": 6210}, {"loss": 1.2547, "grad_norm": 0.9351891875267029, "learning_rate": 0.0002, "epoch": 5.506861443116423, "step": 6220}, {"loss": 1.2804, "grad_norm": 1.0031992197036743, "learning_rate": 0.0002, "epoch": 5.515714918105356, "step": 6230}, {"loss": 1.219, "grad_norm": 0.9935104250907898, "learning_rate": 0.0002, "epoch": 5.52456839309429, "step": 6240}, {"loss": 1.2756, "grad_norm": 1.1086243391036987, "learning_rate": 0.0002, "epoch": 5.533421868083223, "step": 6250}, {"loss": 1.2751, "grad_norm": 0.990772545337677, "learning_rate": 0.0002, "epoch": 5.542275343072156, "step": 6260}, {"loss": 1.2756, "grad_norm": 0.9317597150802612, "learning_rate": 0.0002, "epoch": 5.551128818061089, "step": 6270}, {"loss": 1.2095, "grad_norm": 0.9657552242279053, "learning_rate": 0.0002, "epoch": 5.559982293050022, "step": 6280}, {"loss": 1.2435, "grad_norm": 1.0271565914154053, "learning_rate": 0.0002, "epoch": 5.568835768038955, "step": 6290}, {"loss": 1.2283, "grad_norm": 0.916253924369812, "learning_rate": 0.0002, "epoch": 5.577689243027889, "step": 6300}, {"loss": 1.2648, "grad_norm": 1.0083940029144287, "learning_rate": 0.0002, "epoch": 5.586542718016822, "step": 6310}, {"loss": 1.2904, "grad_norm": 0.9740358591079712, "learning_rate": 0.0002, "epoch": 5.5953961930057545, "step": 6320}, {"loss": 1.2507, "grad_norm": 0.9645405411720276, "learning_rate": 0.0002, "epoch": 5.604249667994688, "step": 6330}, {"loss": 1.2845, "grad_norm": 0.9677100777626038, "learning_rate": 0.0002, "epoch": 5.613103142983621, "step": 6340}, {"loss": 1.2936, "grad_norm": 0.9706602692604065, "learning_rate": 0.0002, "epoch": 5.621956617972554, "step": 6350}, {"loss": 1.2541, "grad_norm": 1.1492316722869873, "learning_rate": 0.0002, "epoch": 5.630810092961488, "step": 6360}, {"loss": 1.2242, "grad_norm": 0.8857277035713196, "learning_rate": 0.0002, "epoch": 5.639663567950421, "step": 6370}, {"loss": 1.2178, "grad_norm": 1.0363037586212158, "learning_rate": 0.0002, "epoch": 5.648517042939353, "step": 6380}, {"loss": 1.1838, "grad_norm": 0.9621800780296326, "learning_rate": 0.0002, "epoch": 5.657370517928287, "step": 6390}, {"loss": 1.2472, "grad_norm": 0.9937820434570312, "learning_rate": 0.0002, "epoch": 5.66622399291722, "step": 6400}, {"loss": 1.2523, "grad_norm": 0.9491283297538757, "learning_rate": 0.0002, "epoch": 5.675077467906153, "step": 6410}, {"loss": 1.2539, "grad_norm": 0.9429448246955872, "learning_rate": 0.0002, "epoch": 5.683930942895087, "step": 6420}, {"loss": 1.1663, "grad_norm": 0.9808844327926636, "learning_rate": 0.0002, "epoch": 5.6927844178840195, "step": 6430}, {"loss": 1.2574, "grad_norm": 0.8191056847572327, "learning_rate": 0.0002, "epoch": 5.701637892872952, "step": 6440}, {"loss": 1.2659, "grad_norm": 1.1118974685668945, "learning_rate": 0.0002, "epoch": 5.710491367861886, "step": 6450}, {"loss": 1.2192, "grad_norm": 0.9030969142913818, "learning_rate": 0.0002, "epoch": 5.719344842850819, "step": 6460}, {"loss": 1.301, "grad_norm": 1.0509997606277466, "learning_rate": 0.0002, "epoch": 5.728198317839752, "step": 6470}, {"loss": 1.217, "grad_norm": 1.0369981527328491, "learning_rate": 0.0002, "epoch": 5.7370517928286855, "step": 6480}, {"loss": 1.2518, "grad_norm": 0.8626071214675903, "learning_rate": 0.0002, "epoch": 5.745905267817618, "step": 6490}, {"loss": 1.2446, "grad_norm": 1.0448849201202393, "learning_rate": 0.0002, "epoch": 5.754758742806551, "step": 6500}, {"loss": 1.2698, "grad_norm": 0.9333119988441467, "learning_rate": 0.0002, "epoch": 5.763612217795485, "step": 6510}, {"loss": 1.2655, "grad_norm": 0.8533532023429871, "learning_rate": 0.0002, "epoch": 5.772465692784418, "step": 6520}, {"loss": 1.3037, "grad_norm": 0.9774261713027954, "learning_rate": 0.0002, "epoch": 5.781319167773351, "step": 6530}, {"loss": 1.2031, "grad_norm": 0.9841071963310242, "learning_rate": 0.0002, "epoch": 5.790172642762284, "step": 6540}, {"loss": 1.2767, "grad_norm": 0.9891805052757263, "learning_rate": 0.0002, "epoch": 5.799026117751217, "step": 6550}, {"loss": 1.3373, "grad_norm": 0.9633952379226685, "learning_rate": 0.0002, "epoch": 5.80787959274015, "step": 6560}, {"loss": 1.1939, "grad_norm": 1.327634334564209, "learning_rate": 0.0002, "epoch": 5.816733067729084, "step": 6570}, {"loss": 1.2985, "grad_norm": 0.9805197715759277, "learning_rate": 0.0002, "epoch": 5.825586542718017, "step": 6580}, {"loss": 1.1933, "grad_norm": 1.020957589149475, "learning_rate": 0.0002, "epoch": 5.8344400177069495, "step": 6590}, {"loss": 1.2582, "grad_norm": 0.9694032669067383, "learning_rate": 0.0002, "epoch": 5.843293492695883, "step": 6600}, {"loss": 1.2671, "grad_norm": 0.8980914354324341, "learning_rate": 0.0002, "epoch": 5.852146967684816, "step": 6610}, {"loss": 1.3391, "grad_norm": 0.8312330842018127, "learning_rate": 0.0002, "epoch": 5.861000442673749, "step": 6620}, {"loss": 1.3301, "grad_norm": 0.9773725271224976, "learning_rate": 0.0002, "epoch": 5.869853917662683, "step": 6630}, {"loss": 1.2697, "grad_norm": 0.9684233665466309, "learning_rate": 0.0002, "epoch": 5.878707392651616, "step": 6640}, {"loss": 1.2866, "grad_norm": 0.8436519503593445, "learning_rate": 0.0002, "epoch": 5.887560867640548, "step": 6650}, {"loss": 1.2213, "grad_norm": 0.9129888415336609, "learning_rate": 0.0002, "epoch": 5.896414342629482, "step": 6660}, {"loss": 1.3272, "grad_norm": 0.8871369957923889, "learning_rate": 0.0002, "epoch": 5.905267817618415, "step": 6670}, {"loss": 1.3758, "grad_norm": 0.9544420838356018, "learning_rate": 0.0002, "epoch": 5.914121292607349, "step": 6680}, {"loss": 1.2954, "grad_norm": 0.9607448577880859, "learning_rate": 0.0002, "epoch": 5.922974767596282, "step": 6690}, {"loss": 1.2448, "grad_norm": 0.9675708413124084, "learning_rate": 0.0002, "epoch": 5.9318282425852145, "step": 6700}, {"loss": 1.3208, "grad_norm": 0.9373534321784973, "learning_rate": 0.0002, "epoch": 5.940681717574148, "step": 6710}, {"loss": 1.2982, "grad_norm": 0.9750351905822754, "learning_rate": 0.0002, "epoch": 5.949535192563081, "step": 6720}, {"loss": 1.2575, "grad_norm": 0.9122727513313293, "learning_rate": 0.0002, "epoch": 5.958388667552014, "step": 6730}, {"loss": 1.2259, "grad_norm": 0.9300726652145386, "learning_rate": 0.0002, "epoch": 5.967242142540948, "step": 6740}, {"loss": 1.2634, "grad_norm": 0.972944438457489, "learning_rate": 0.0002, "epoch": 5.9760956175298805, "step": 6750}, {"loss": 1.3252, "grad_norm": 1.2385832071304321, "learning_rate": 0.0002, "epoch": 5.984949092518813, "step": 6760}, {"loss": 1.2417, "grad_norm": 0.9080338478088379, "learning_rate": 0.0002, "epoch": 5.993802567507747, "step": 6770}, {"eval_loss": 2.062002658843994, "eval_runtime": 83.2814, "eval_samples_per_second": 6.184, "eval_steps_per_second": 0.78, "epoch": 6.0, "step": 6777}, {"loss": 1.2408, "grad_norm": 0.8741096258163452, "learning_rate": 0.0002, "epoch": 6.00265604249668, "step": 6780}, {"loss": 1.1242, "grad_norm": 1.2510347366333008, "learning_rate": 0.0002, "epoch": 6.011509517485613, "step": 6790}, {"loss": 1.0269, "grad_norm": 1.063910722732544, "learning_rate": 0.0002, "epoch": 6.0203629924745465, "step": 6800}, {"loss": 1.0468, "grad_norm": 1.169573187828064, "learning_rate": 0.0002, "epoch": 6.029216467463479, "step": 6810}, {"loss": 1.1221, "grad_norm": 1.0453242063522339, "learning_rate": 0.0002, "epoch": 6.038069942452412, "step": 6820}, {"loss": 1.0469, "grad_norm": 1.1960445642471313, "learning_rate": 0.0002, "epoch": 6.046923417441346, "step": 6830}, {"loss": 1.1233, "grad_norm": 0.9427650570869446, "learning_rate": 0.0002, "epoch": 6.055776892430279, "step": 6840}, {"loss": 1.0114, "grad_norm": 1.2107350826263428, "learning_rate": 0.0002, "epoch": 6.064630367419212, "step": 6850}, {"loss": 1.0751, "grad_norm": 1.262130856513977, "learning_rate": 0.0002, "epoch": 6.073483842408145, "step": 6860}, {"loss": 1.0787, "grad_norm": 1.1628082990646362, "learning_rate": 0.0002, "epoch": 6.082337317397078, "step": 6870}, {"loss": 1.0828, "grad_norm": 1.0090514421463013, "learning_rate": 0.0002, "epoch": 6.091190792386011, "step": 6880}, {"loss": 1.0718, "grad_norm": 1.5029802322387695, "learning_rate": 0.0002, "epoch": 6.100044267374945, "step": 6890}, {"loss": 1.0549, "grad_norm": 1.0522133111953735, "learning_rate": 0.0002, "epoch": 6.108897742363878, "step": 6900}, {"loss": 1.0502, "grad_norm": 1.225534439086914, "learning_rate": 0.0002, "epoch": 6.117751217352811, "step": 6910}, {"loss": 1.0808, "grad_norm": 1.2859058380126953, "learning_rate": 0.0002, "epoch": 6.126604692341744, "step": 6920}, {"loss": 1.1206, "grad_norm": 1.215205192565918, "learning_rate": 0.0002, "epoch": 6.135458167330677, "step": 6930}, {"loss": 1.1442, "grad_norm": 1.1799274682998657, "learning_rate": 0.0002, "epoch": 6.14431164231961, "step": 6940}, {"loss": 1.0749, "grad_norm": 1.2553550004959106, "learning_rate": 0.0002, "epoch": 6.153165117308544, "step": 6950}, {"loss": 1.1427, "grad_norm": 1.2171931266784668, "learning_rate": 0.0002, "epoch": 6.162018592297477, "step": 6960}, {"loss": 1.0579, "grad_norm": 1.1896923780441284, "learning_rate": 0.0002, "epoch": 6.17087206728641, "step": 6970}, {"loss": 1.1477, "grad_norm": 1.007250189781189, "learning_rate": 0.0002, "epoch": 6.179725542275343, "step": 6980}, {"loss": 1.1551, "grad_norm": 1.2109580039978027, "learning_rate": 0.0002, "epoch": 6.188579017264276, "step": 6990}, {"loss": 1.0809, "grad_norm": 1.2197009325027466, "learning_rate": 0.0002, "epoch": 6.19743249225321, "step": 7000}, {"loss": 1.1322, "grad_norm": 1.1417629718780518, "learning_rate": 0.0002, "epoch": 6.206285967242143, "step": 7010}, {"loss": 1.0541, "grad_norm": 1.2337356805801392, "learning_rate": 0.0002, "epoch": 6.2151394422310755, "step": 7020}, {"loss": 1.0195, "grad_norm": 1.1230454444885254, "learning_rate": 0.0002, "epoch": 6.223992917220009, "step": 7030}, {"loss": 1.1873, "grad_norm": 1.0634387731552124, "learning_rate": 0.0002, "epoch": 6.232846392208942, "step": 7040}, {"loss": 1.0892, "grad_norm": 1.1566855907440186, "learning_rate": 0.0002, "epoch": 6.241699867197875, "step": 7050}, {"loss": 1.063, "grad_norm": 1.2251075506210327, "learning_rate": 0.0002, "epoch": 6.250553342186809, "step": 7060}, {"loss": 1.1169, "grad_norm": 1.2232472896575928, "learning_rate": 0.0002, "epoch": 6.2594068171757415, "step": 7070}, {"loss": 1.0394, "grad_norm": 1.1014091968536377, "learning_rate": 0.0002, "epoch": 6.268260292164674, "step": 7080}, {"loss": 1.0627, "grad_norm": 1.322811245918274, "learning_rate": 0.0002, "epoch": 6.277113767153608, "step": 7090}, {"loss": 1.1108, "grad_norm": 0.9820072650909424, "learning_rate": 0.0002, "epoch": 6.285967242142541, "step": 7100}, {"loss": 1.0823, "grad_norm": 1.13047456741333, "learning_rate": 0.0002, "epoch": 6.294820717131474, "step": 7110}, {"loss": 1.1012, "grad_norm": 1.145127534866333, "learning_rate": 0.0002, "epoch": 6.303674192120408, "step": 7120}, {"loss": 1.089, "grad_norm": 1.101465106010437, "learning_rate": 0.0002, "epoch": 6.31252766710934, "step": 7130}, {"loss": 1.1122, "grad_norm": 1.131705641746521, "learning_rate": 0.0002, "epoch": 6.321381142098273, "step": 7140}, {"loss": 1.0173, "grad_norm": 0.9876824617385864, "learning_rate": 0.0002, "epoch": 6.330234617087207, "step": 7150}, {"loss": 1.0184, "grad_norm": 1.2950096130371094, "learning_rate": 0.0002, "epoch": 6.33908809207614, "step": 7160}, {"loss": 1.0559, "grad_norm": 1.0496132373809814, "learning_rate": 0.0002, "epoch": 6.347941567065073, "step": 7170}, {"loss": 1.1334, "grad_norm": 1.3835711479187012, "learning_rate": 0.0002, "epoch": 6.3567950420540065, "step": 7180}, {"loss": 0.9777, "grad_norm": 1.176424503326416, "learning_rate": 0.0002, "epoch": 6.365648517042939, "step": 7190}, {"loss": 1.1034, "grad_norm": 1.3502846956253052, "learning_rate": 0.0002, "epoch": 6.374501992031872, "step": 7200}, {"loss": 1.0614, "grad_norm": 1.2429769039154053, "learning_rate": 0.0002, "epoch": 6.383355467020806, "step": 7210}, {"loss": 1.1712, "grad_norm": 1.138015866279602, "learning_rate": 0.0002, "epoch": 6.392208942009739, "step": 7220}, {"loss": 1.1602, "grad_norm": 1.4407539367675781, "learning_rate": 0.0002, "epoch": 6.401062416998672, "step": 7230}, {"loss": 1.1595, "grad_norm": 1.1464104652404785, "learning_rate": 0.0002, "epoch": 6.409915891987605, "step": 7240}, {"loss": 1.1381, "grad_norm": 1.2028888463974, "learning_rate": 0.0002, "epoch": 6.418769366976538, "step": 7250}, {"loss": 1.1129, "grad_norm": 1.132938027381897, "learning_rate": 0.0002, "epoch": 6.427622841965471, "step": 7260}, {"loss": 1.0662, "grad_norm": 1.2005301713943481, "learning_rate": 0.0002, "epoch": 6.436476316954405, "step": 7270}, {"loss": 1.0538, "grad_norm": 1.0460501909255981, "learning_rate": 0.0002, "epoch": 6.445329791943338, "step": 7280}, {"loss": 1.0958, "grad_norm": 1.1363240480422974, "learning_rate": 0.0002, "epoch": 6.4541832669322705, "step": 7290}, {"loss": 1.1042, "grad_norm": 1.0439460277557373, "learning_rate": 0.0002, "epoch": 6.463036741921204, "step": 7300}, {"loss": 1.0896, "grad_norm": 1.1968905925750732, "learning_rate": 0.0002, "epoch": 6.471890216910137, "step": 7310}, {"loss": 1.0891, "grad_norm": 1.0443525314331055, "learning_rate": 0.0002, "epoch": 6.48074369189907, "step": 7320}, {"loss": 1.1384, "grad_norm": 1.2550246715545654, "learning_rate": 0.0002, "epoch": 6.489597166888004, "step": 7330}, {"loss": 1.2028, "grad_norm": 1.2880409955978394, "learning_rate": 0.0002, "epoch": 6.4984506418769366, "step": 7340}, {"loss": 1.1173, "grad_norm": 1.2390265464782715, "learning_rate": 0.0002, "epoch": 6.507304116865869, "step": 7350}, {"loss": 1.065, "grad_norm": 1.0650159120559692, "learning_rate": 0.0002, "epoch": 6.516157591854803, "step": 7360}, {"loss": 1.1072, "grad_norm": 1.4934154748916626, "learning_rate": 0.0002, "epoch": 6.525011066843736, "step": 7370}, {"loss": 1.0436, "grad_norm": 1.0902682542800903, "learning_rate": 0.0002, "epoch": 6.533864541832669, "step": 7380}, {"loss": 1.145, "grad_norm": 1.1561789512634277, "learning_rate": 0.0002, "epoch": 6.542718016821603, "step": 7390}, {"loss": 1.1633, "grad_norm": 1.1010485887527466, "learning_rate": 0.0002, "epoch": 6.551571491810535, "step": 7400}, {"loss": 1.1063, "grad_norm": 1.1616493463516235, "learning_rate": 0.0002, "epoch": 6.560424966799468, "step": 7410}, {"loss": 1.1217, "grad_norm": 1.2321627140045166, "learning_rate": 0.0002, "epoch": 6.569278441788402, "step": 7420}, {"loss": 1.135, "grad_norm": 1.162299394607544, "learning_rate": 0.0002, "epoch": 6.578131916777335, "step": 7430}, {"loss": 1.1785, "grad_norm": 0.9935213923454285, "learning_rate": 0.0002, "epoch": 6.586985391766269, "step": 7440}, {"loss": 1.078, "grad_norm": 1.3035451173782349, "learning_rate": 0.0002, "epoch": 6.5958388667552015, "step": 7450}, {"loss": 1.1377, "grad_norm": 1.0957173109054565, "learning_rate": 0.0002, "epoch": 6.604692341744134, "step": 7460}, {"loss": 1.1882, "grad_norm": 1.166472315788269, "learning_rate": 0.0002, "epoch": 6.613545816733068, "step": 7470}, {"loss": 1.1379, "grad_norm": 1.332716464996338, "learning_rate": 0.0002, "epoch": 6.622399291722001, "step": 7480}, {"loss": 1.1686, "grad_norm": 1.1008102893829346, "learning_rate": 0.0002, "epoch": 6.631252766710934, "step": 7490}, {"loss": 1.1644, "grad_norm": 1.4472310543060303, "learning_rate": 0.0002, "epoch": 6.6401062416998675, "step": 7500}, {"loss": 1.1729, "grad_norm": 1.1247508525848389, "learning_rate": 0.0002, "epoch": 6.6489597166888, "step": 7510}, {"loss": 1.1649, "grad_norm": 1.297936201095581, "learning_rate": 0.0002, "epoch": 6.657813191677733, "step": 7520}, {"loss": 1.1178, "grad_norm": 1.0784718990325928, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 7530}, {"loss": 1.0852, "grad_norm": 1.1518864631652832, "learning_rate": 0.0002, "epoch": 6.6755201416556, "step": 7540}, {"loss": 1.1611, "grad_norm": 1.1135684251785278, "learning_rate": 0.0002, "epoch": 6.684373616644533, "step": 7550}, {"loss": 1.1257, "grad_norm": 1.0792579650878906, "learning_rate": 0.0002, "epoch": 6.693227091633466, "step": 7560}, {"loss": 1.1466, "grad_norm": 1.1826539039611816, "learning_rate": 0.0002, "epoch": 6.702080566622399, "step": 7570}, {"loss": 1.0874, "grad_norm": 1.1485552787780762, "learning_rate": 0.0002, "epoch": 6.710934041611332, "step": 7580}, {"loss": 1.0502, "grad_norm": 1.090723991394043, "learning_rate": 0.0002, "epoch": 6.719787516600266, "step": 7590}, {"loss": 1.0627, "grad_norm": 1.105883002281189, "learning_rate": 0.0002, "epoch": 6.728640991589199, "step": 7600}, {"loss": 1.1101, "grad_norm": 1.3093862533569336, "learning_rate": 0.0002, "epoch": 6.737494466578132, "step": 7610}, {"loss": 1.1202, "grad_norm": 1.0273808240890503, "learning_rate": 0.0002, "epoch": 6.746347941567065, "step": 7620}, {"loss": 1.2071, "grad_norm": 1.3253363370895386, "learning_rate": 0.0002, "epoch": 6.755201416555998, "step": 7630}, {"loss": 1.0833, "grad_norm": 1.1979365348815918, "learning_rate": 0.0002, "epoch": 6.764054891544931, "step": 7640}, {"loss": 1.1208, "grad_norm": 1.123506784439087, "learning_rate": 0.0002, "epoch": 6.772908366533865, "step": 7650}, {"loss": 1.2111, "grad_norm": 1.3928422927856445, "learning_rate": 0.0002, "epoch": 6.781761841522798, "step": 7660}, {"loss": 1.1535, "grad_norm": 1.1540825366973877, "learning_rate": 0.0002, "epoch": 6.790615316511731, "step": 7670}, {"loss": 1.1053, "grad_norm": 1.0836732387542725, "learning_rate": 0.0002, "epoch": 6.799468791500664, "step": 7680}, {"loss": 1.1049, "grad_norm": 1.0360240936279297, "learning_rate": 0.0002, "epoch": 6.808322266489597, "step": 7690}, {"loss": 1.1819, "grad_norm": 1.2440129518508911, "learning_rate": 0.0002, "epoch": 6.817175741478531, "step": 7700}, {"loss": 1.1245, "grad_norm": 1.1702594757080078, "learning_rate": 0.0002, "epoch": 6.826029216467464, "step": 7710}, {"loss": 1.1021, "grad_norm": 1.0726280212402344, "learning_rate": 0.0002, "epoch": 6.8348826914563965, "step": 7720}, {"loss": 1.1471, "grad_norm": 0.9410907030105591, "learning_rate": 0.0002, "epoch": 6.84373616644533, "step": 7730}, {"loss": 1.1616, "grad_norm": 1.042914867401123, "learning_rate": 0.0002, "epoch": 6.852589641434263, "step": 7740}, {"loss": 1.215, "grad_norm": 1.1028170585632324, "learning_rate": 0.0002, "epoch": 6.861443116423196, "step": 7750}, {"loss": 1.0759, "grad_norm": 1.0990355014801025, "learning_rate": 0.0002, "epoch": 6.87029659141213, "step": 7760}, {"loss": 1.1508, "grad_norm": 1.2572479248046875, "learning_rate": 0.0002, "epoch": 6.8791500664010625, "step": 7770}, {"loss": 1.1749, "grad_norm": 1.250198483467102, "learning_rate": 0.0002, "epoch": 6.888003541389995, "step": 7780}, {"loss": 1.1299, "grad_norm": 1.1872532367706299, "learning_rate": 0.0002, "epoch": 6.896857016378929, "step": 7790}, {"loss": 1.129, "grad_norm": 1.5275602340698242, "learning_rate": 0.0002, "epoch": 6.905710491367862, "step": 7800}, {"loss": 1.0712, "grad_norm": 1.015166163444519, "learning_rate": 0.0002, "epoch": 6.914563966356795, "step": 7810}, {"loss": 1.1931, "grad_norm": 1.3205344676971436, "learning_rate": 0.0002, "epoch": 6.923417441345729, "step": 7820}, {"loss": 1.222, "grad_norm": 1.1329596042633057, "learning_rate": 0.0002, "epoch": 6.932270916334661, "step": 7830}, {"loss": 1.1207, "grad_norm": 1.1614333391189575, "learning_rate": 0.0002, "epoch": 6.941124391323594, "step": 7840}, {"loss": 1.2127, "grad_norm": 1.3472208976745605, "learning_rate": 0.0002, "epoch": 6.949977866312528, "step": 7850}, {"loss": 1.1002, "grad_norm": 1.1490193605422974, "learning_rate": 0.0002, "epoch": 6.958831341301461, "step": 7860}, {"loss": 1.1362, "grad_norm": 1.1343097686767578, "learning_rate": 0.0002, "epoch": 6.967684816290394, "step": 7870}, {"loss": 1.1622, "grad_norm": 1.2555341720581055, "learning_rate": 0.0002, "epoch": 6.9765382912793275, "step": 7880}, {"loss": 1.0955, "grad_norm": 1.2695735692977905, "learning_rate": 0.0002, "epoch": 6.98539176626826, "step": 7890}, {"loss": 1.1718, "grad_norm": 1.1662464141845703, "learning_rate": 0.0002, "epoch": 6.994245241257193, "step": 7900}]} +{"epoch": 7.996458610004427, "step": 9032, "epoch_duration": 2938.937849998474, "total_accumulated_duration": 23595.457686662674, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-10000/checkpoint-2259", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4916, "grad_norm": 0.4775333106517792, "learning_rate": 0.0002, "epoch": 0.008853474988933156, "step": 10}, {"loss": 2.3137, "grad_norm": 0.5485824346542358, "learning_rate": 0.0002, "epoch": 0.017706949977866312, "step": 20}, {"loss": 2.0984, "grad_norm": 0.5675218105316162, "learning_rate": 0.0002, "epoch": 0.02656042496679947, "step": 30}, {"loss": 2.0622, "grad_norm": 0.696494460105896, "learning_rate": 0.0002, "epoch": 0.035413899955732624, "step": 40}, {"loss": 1.9547, "grad_norm": 0.4788398742675781, "learning_rate": 0.0002, "epoch": 0.04426737494466578, "step": 50}, {"loss": 1.8722, "grad_norm": 0.4763128161430359, "learning_rate": 0.0002, "epoch": 0.05312084993359894, "step": 60}, {"loss": 1.8632, "grad_norm": 0.5929698348045349, "learning_rate": 0.0002, "epoch": 0.0619743249225321, "step": 70}, {"loss": 1.9573, "grad_norm": 0.5899396538734436, "learning_rate": 0.0002, "epoch": 0.07082779991146525, "step": 80}, {"loss": 1.8308, "grad_norm": 0.460123747587204, "learning_rate": 0.0002, "epoch": 0.0796812749003984, "step": 90}, {"loss": 1.7615, "grad_norm": 0.4184812009334564, "learning_rate": 0.0002, "epoch": 0.08853474988933156, "step": 100}, {"loss": 1.8079, "grad_norm": 0.4051891267299652, "learning_rate": 0.0002, "epoch": 0.09738822487826472, "step": 110}, {"loss": 1.8911, "grad_norm": 0.3709661066532135, "learning_rate": 0.0002, "epoch": 0.10624169986719788, "step": 120}, {"loss": 1.8695, "grad_norm": 0.4783487915992737, "learning_rate": 0.0002, "epoch": 0.11509517485613104, "step": 130}, {"loss": 1.8602, "grad_norm": 0.36478137969970703, "learning_rate": 0.0002, "epoch": 0.1239486498450642, "step": 140}, {"loss": 1.7814, "grad_norm": 0.4005294442176819, "learning_rate": 0.0002, "epoch": 0.13280212483399734, "step": 150}, {"loss": 1.799, "grad_norm": 0.42357513308525085, "learning_rate": 0.0002, "epoch": 0.1416555998229305, "step": 160}, {"loss": 1.8835, "grad_norm": 0.3913971781730652, "learning_rate": 0.0002, "epoch": 0.15050907481186365, "step": 170}, {"loss": 1.8507, "grad_norm": 0.4650019407272339, "learning_rate": 0.0002, "epoch": 0.1593625498007968, "step": 180}, {"loss": 1.8036, "grad_norm": 0.5545958876609802, "learning_rate": 0.0002, "epoch": 0.16821602478972997, "step": 190}, {"loss": 1.8676, "grad_norm": 0.3669356107711792, "learning_rate": 0.0002, "epoch": 0.17706949977866313, "step": 200}, {"loss": 1.8169, "grad_norm": 0.3683622181415558, "learning_rate": 0.0002, "epoch": 0.18592297476759628, "step": 210}, {"loss": 1.8117, "grad_norm": 0.39825671911239624, "learning_rate": 0.0002, "epoch": 0.19477644975652944, "step": 220}, {"loss": 1.8332, "grad_norm": 0.4298318326473236, "learning_rate": 0.0002, "epoch": 0.2036299247454626, "step": 230}, {"loss": 1.8339, "grad_norm": 0.36111244559288025, "learning_rate": 0.0002, "epoch": 0.21248339973439576, "step": 240}, {"loss": 1.78, "grad_norm": 0.3711858093738556, "learning_rate": 0.0002, "epoch": 0.2213368747233289, "step": 250}, {"loss": 1.8643, "grad_norm": 0.37717559933662415, "learning_rate": 0.0002, "epoch": 0.23019034971226207, "step": 260}, {"loss": 1.7683, "grad_norm": 0.3678877651691437, "learning_rate": 0.0002, "epoch": 0.23904382470119523, "step": 270}, {"loss": 1.8235, "grad_norm": 0.4165912866592407, "learning_rate": 0.0002, "epoch": 0.2478972996901284, "step": 280}, {"loss": 1.8033, "grad_norm": 0.3403240740299225, "learning_rate": 0.0002, "epoch": 0.25675077467906154, "step": 290}, {"loss": 1.8704, "grad_norm": 0.4023234248161316, "learning_rate": 0.0002, "epoch": 0.2656042496679947, "step": 300}, {"loss": 1.7721, "grad_norm": 0.32472360134124756, "learning_rate": 0.0002, "epoch": 0.27445772465692786, "step": 310}, {"loss": 1.8544, "grad_norm": 0.36464595794677734, "learning_rate": 0.0002, "epoch": 0.283311199645861, "step": 320}, {"loss": 1.8168, "grad_norm": 0.3868598937988281, "learning_rate": 0.0002, "epoch": 0.2921646746347942, "step": 330}, {"loss": 1.772, "grad_norm": 0.3123539686203003, "learning_rate": 0.0002, "epoch": 0.3010181496237273, "step": 340}, {"loss": 1.8285, "grad_norm": 0.3392639458179474, "learning_rate": 0.0002, "epoch": 0.3098716246126605, "step": 350}, {"loss": 1.806, "grad_norm": 0.42070651054382324, "learning_rate": 0.0002, "epoch": 0.3187250996015936, "step": 360}, {"loss": 1.8319, "grad_norm": 0.3650900423526764, "learning_rate": 0.0002, "epoch": 0.3275785745905268, "step": 370}, {"loss": 1.8388, "grad_norm": 0.41388973593711853, "learning_rate": 0.0002, "epoch": 0.33643204957945994, "step": 380}, {"loss": 1.79, "grad_norm": 0.36625272035598755, "learning_rate": 0.0002, "epoch": 0.3452855245683931, "step": 390}, {"loss": 1.8271, "grad_norm": 0.3930284082889557, "learning_rate": 0.0002, "epoch": 0.35413899955732625, "step": 400}, {"loss": 1.8664, "grad_norm": 0.3415820300579071, "learning_rate": 0.0002, "epoch": 0.3629924745462594, "step": 410}, {"loss": 1.8885, "grad_norm": 0.4256570041179657, "learning_rate": 0.0002, "epoch": 0.37184594953519257, "step": 420}, {"loss": 1.7728, "grad_norm": 0.3740842938423157, "learning_rate": 0.0002, "epoch": 0.3806994245241257, "step": 430}, {"loss": 1.7676, "grad_norm": 0.334108829498291, "learning_rate": 0.0002, "epoch": 0.3895528995130589, "step": 440}, {"loss": 1.7837, "grad_norm": 0.33186739683151245, "learning_rate": 0.0002, "epoch": 0.398406374501992, "step": 450}, {"loss": 1.8885, "grad_norm": 0.39127954840660095, "learning_rate": 0.0002, "epoch": 0.4072598494909252, "step": 460}, {"loss": 1.8053, "grad_norm": 0.331443727016449, "learning_rate": 0.0002, "epoch": 0.4161133244798583, "step": 470}, {"loss": 1.783, "grad_norm": 0.36834150552749634, "learning_rate": 0.0002, "epoch": 0.4249667994687915, "step": 480}, {"loss": 1.7549, "grad_norm": 0.338123619556427, "learning_rate": 0.0002, "epoch": 0.43382027445772464, "step": 490}, {"loss": 1.795, "grad_norm": 0.3891060948371887, "learning_rate": 0.0002, "epoch": 0.4426737494466578, "step": 500}, {"loss": 1.7639, "grad_norm": 0.3486529290676117, "learning_rate": 0.0002, "epoch": 0.45152722443559096, "step": 510}, {"loss": 1.796, "grad_norm": 0.3635135889053345, "learning_rate": 0.0002, "epoch": 0.46038069942452414, "step": 520}, {"loss": 1.8068, "grad_norm": 0.7706693410873413, "learning_rate": 0.0002, "epoch": 0.4692341744134573, "step": 530}, {"loss": 1.8048, "grad_norm": 0.33725443482398987, "learning_rate": 0.0002, "epoch": 0.47808764940239046, "step": 540}, {"loss": 1.8023, "grad_norm": 0.3127504289150238, "learning_rate": 0.0002, "epoch": 0.4869411243913236, "step": 550}, {"loss": 1.7693, "grad_norm": 0.3527977466583252, "learning_rate": 0.0002, "epoch": 0.4957945993802568, "step": 560}, {"loss": 1.7989, "grad_norm": 0.3574548661708832, "learning_rate": 0.0002, "epoch": 0.5046480743691899, "step": 570}, {"loss": 1.7699, "grad_norm": 0.32787248492240906, "learning_rate": 0.0002, "epoch": 0.5135015493581231, "step": 580}, {"loss": 1.7502, "grad_norm": 0.3309430778026581, "learning_rate": 0.0002, "epoch": 0.5223550243470563, "step": 590}, {"loss": 1.7798, "grad_norm": 0.34276407957077026, "learning_rate": 0.0002, "epoch": 0.5312084993359893, "step": 600}, {"loss": 1.7517, "grad_norm": 0.3343711495399475, "learning_rate": 0.0002, "epoch": 0.5400619743249225, "step": 610}, {"loss": 1.7661, "grad_norm": 0.3193040192127228, "learning_rate": 0.0002, "epoch": 0.5489154493138557, "step": 620}, {"loss": 1.7769, "grad_norm": 0.3059828579425812, "learning_rate": 0.0002, "epoch": 0.5577689243027888, "step": 630}, {"loss": 1.8166, "grad_norm": 0.37237173318862915, "learning_rate": 0.0002, "epoch": 0.566622399291722, "step": 640}, {"loss": 1.7531, "grad_norm": 0.36022549867630005, "learning_rate": 0.0002, "epoch": 0.5754758742806552, "step": 650}, {"loss": 1.771, "grad_norm": 0.34974920749664307, "learning_rate": 0.0002, "epoch": 0.5843293492695883, "step": 660}, {"loss": 1.8226, "grad_norm": 0.37135401368141174, "learning_rate": 0.0002, "epoch": 0.5931828242585214, "step": 670}, {"loss": 1.7456, "grad_norm": 0.3385699689388275, "learning_rate": 0.0002, "epoch": 0.6020362992474546, "step": 680}, {"loss": 1.7696, "grad_norm": 0.36015814542770386, "learning_rate": 0.0002, "epoch": 0.6108897742363878, "step": 690}, {"loss": 1.7892, "grad_norm": 0.3503795564174652, "learning_rate": 0.0002, "epoch": 0.619743249225321, "step": 700}, {"loss": 1.7733, "grad_norm": 0.3447190225124359, "learning_rate": 0.0002, "epoch": 0.628596724214254, "step": 710}, {"loss": 1.794, "grad_norm": 0.3193499445915222, "learning_rate": 0.0002, "epoch": 0.6374501992031872, "step": 720}, {"loss": 1.8046, "grad_norm": 0.37058180570602417, "learning_rate": 0.0002, "epoch": 0.6463036741921204, "step": 730}, {"loss": 1.8391, "grad_norm": 0.42216411232948303, "learning_rate": 0.0002, "epoch": 0.6551571491810536, "step": 740}, {"loss": 1.7142, "grad_norm": 0.3091185688972473, "learning_rate": 0.0002, "epoch": 0.6640106241699867, "step": 750}, {"loss": 1.8624, "grad_norm": 0.33168601989746094, "learning_rate": 0.0002, "epoch": 0.6728640991589199, "step": 760}, {"loss": 1.7123, "grad_norm": 0.31269341707229614, "learning_rate": 0.0002, "epoch": 0.6817175741478531, "step": 770}, {"loss": 1.8526, "grad_norm": 0.36125293374061584, "learning_rate": 0.0002, "epoch": 0.6905710491367862, "step": 780}, {"loss": 1.7478, "grad_norm": 0.3145293593406677, "learning_rate": 0.0002, "epoch": 0.6994245241257193, "step": 790}, {"loss": 1.6545, "grad_norm": 0.3611990809440613, "learning_rate": 0.0002, "epoch": 0.7082779991146525, "step": 800}, {"loss": 1.892, "grad_norm": 0.3165971636772156, "learning_rate": 0.0002, "epoch": 0.7171314741035857, "step": 810}, {"loss": 1.8251, "grad_norm": 0.3364323675632477, "learning_rate": 0.0002, "epoch": 0.7259849490925188, "step": 820}, {"loss": 1.8508, "grad_norm": 0.4310600757598877, "learning_rate": 0.0002, "epoch": 0.734838424081452, "step": 830}, {"loss": 1.7816, "grad_norm": 0.3414389491081238, "learning_rate": 0.0002, "epoch": 0.7436918990703851, "step": 840}, {"loss": 1.8148, "grad_norm": 0.35536202788352966, "learning_rate": 0.0002, "epoch": 0.7525453740593183, "step": 850}, {"loss": 1.8241, "grad_norm": 0.3232460618019104, "learning_rate": 0.0002, "epoch": 0.7613988490482514, "step": 860}, {"loss": 1.7312, "grad_norm": 0.32734858989715576, "learning_rate": 0.0002, "epoch": 0.7702523240371846, "step": 870}, {"loss": 1.7241, "grad_norm": 0.3433493673801422, "learning_rate": 0.0002, "epoch": 0.7791057990261178, "step": 880}, {"loss": 1.7375, "grad_norm": 0.33354780077934265, "learning_rate": 0.0002, "epoch": 0.787959274015051, "step": 890}, {"loss": 1.7314, "grad_norm": 0.30728545784950256, "learning_rate": 0.0002, "epoch": 0.796812749003984, "step": 900}, {"loss": 1.8267, "grad_norm": 0.3373030126094818, "learning_rate": 0.0002, "epoch": 0.8056662239929172, "step": 910}, {"loss": 1.8479, "grad_norm": 0.3468782603740692, "learning_rate": 0.0002, "epoch": 0.8145196989818504, "step": 920}, {"loss": 1.8548, "grad_norm": 0.33520200848579407, "learning_rate": 0.0002, "epoch": 0.8233731739707836, "step": 930}, {"loss": 1.7932, "grad_norm": 0.35207098722457886, "learning_rate": 0.0002, "epoch": 0.8322266489597167, "step": 940}, {"loss": 1.7804, "grad_norm": 0.4000207483768463, "learning_rate": 0.0002, "epoch": 0.8410801239486498, "step": 950}, {"loss": 1.7996, "grad_norm": 0.35362836718559265, "learning_rate": 0.0002, "epoch": 0.849933598937583, "step": 960}, {"loss": 1.7497, "grad_norm": 0.3470745086669922, "learning_rate": 0.0002, "epoch": 0.8587870739265162, "step": 970}, {"loss": 1.8174, "grad_norm": 0.31602704524993896, "learning_rate": 0.0002, "epoch": 0.8676405489154493, "step": 980}, {"loss": 1.7734, "grad_norm": 0.3062942326068878, "learning_rate": 0.0002, "epoch": 0.8764940239043825, "step": 990}, {"loss": 1.7804, "grad_norm": 0.36963850259780884, "learning_rate": 0.0002, "epoch": 0.8853474988933157, "step": 1000}, {"loss": 1.7309, "grad_norm": 0.3384034037590027, "learning_rate": 0.0002, "epoch": 0.8942009738822487, "step": 1010}, {"loss": 1.7945, "grad_norm": 0.30436110496520996, "learning_rate": 0.0002, "epoch": 0.9030544488711819, "step": 1020}, {"loss": 1.7126, "grad_norm": 3.499784469604492, "learning_rate": 0.0002, "epoch": 0.9119079238601151, "step": 1030}, {"loss": 1.7847, "grad_norm": 0.3130280375480652, "learning_rate": 0.0002, "epoch": 0.9207613988490483, "step": 1040}, {"loss": 1.7527, "grad_norm": 0.29976674914360046, "learning_rate": 0.0002, "epoch": 0.9296148738379814, "step": 1050}, {"loss": 1.7753, "grad_norm": 0.35852617025375366, "learning_rate": 0.0002, "epoch": 0.9384683488269145, "step": 1060}, {"loss": 1.7507, "grad_norm": 0.3288591504096985, "learning_rate": 0.0002, "epoch": 0.9473218238158477, "step": 1070}, {"loss": 1.8155, "grad_norm": 0.32641634345054626, "learning_rate": 0.0002, "epoch": 0.9561752988047809, "step": 1080}, {"loss": 1.7912, "grad_norm": 0.3305715322494507, "learning_rate": 0.0002, "epoch": 0.965028773793714, "step": 1090}, {"loss": 1.8368, "grad_norm": 0.30650773644447327, "learning_rate": 0.0002, "epoch": 0.9738822487826472, "step": 1100}, {"loss": 1.6739, "grad_norm": 0.3330624997615814, "learning_rate": 0.0002, "epoch": 0.9827357237715804, "step": 1110}, {"loss": 1.8392, "grad_norm": 0.3173314034938812, "learning_rate": 0.0002, "epoch": 0.9915891987605135, "step": 1120}, {"eval_loss": 1.8095673322677612, "eval_runtime": 82.6312, "eval_samples_per_second": 6.233, "eval_steps_per_second": 0.787, "epoch": 0.9995573262505534, "step": 1129}, {"loss": 1.7997, "grad_norm": 0.3092995882034302, "learning_rate": 0.0002, "epoch": 1.0004426737494467, "step": 1130}, {"loss": 1.6958, "grad_norm": 0.34386494755744934, "learning_rate": 0.0002, "epoch": 1.0092961487383798, "step": 1140}, {"loss": 1.7149, "grad_norm": 0.2887897789478302, "learning_rate": 0.0002, "epoch": 1.0181496237273129, "step": 1150}, {"loss": 1.7377, "grad_norm": 0.3706893026828766, "learning_rate": 0.0002, "epoch": 1.0270030987162462, "step": 1160}, {"loss": 1.6604, "grad_norm": 0.34724316000938416, "learning_rate": 0.0002, "epoch": 1.0358565737051793, "step": 1170}, {"loss": 1.7749, "grad_norm": 0.41001757979393005, "learning_rate": 0.0002, "epoch": 1.0447100486941125, "step": 1180}, {"loss": 1.6332, "grad_norm": 0.34838348627090454, "learning_rate": 0.0002, "epoch": 1.0535635236830456, "step": 1190}, {"loss": 1.7416, "grad_norm": 0.37201181054115295, "learning_rate": 0.0002, "epoch": 1.0624169986719787, "step": 1200}, {"loss": 1.7707, "grad_norm": 0.36871352791786194, "learning_rate": 0.0002, "epoch": 1.071270473660912, "step": 1210}, {"loss": 1.6769, "grad_norm": 0.35687458515167236, "learning_rate": 0.0002, "epoch": 1.080123948649845, "step": 1220}, {"loss": 1.7235, "grad_norm": 0.3864741921424866, "learning_rate": 0.0002, "epoch": 1.0889774236387781, "step": 1230}, {"loss": 1.729, "grad_norm": 0.3496808707714081, "learning_rate": 0.0002, "epoch": 1.0978308986277114, "step": 1240}, {"loss": 1.7192, "grad_norm": 0.3444930911064148, "learning_rate": 0.0002, "epoch": 1.1066843736166445, "step": 1250}, {"loss": 1.6672, "grad_norm": 0.353188693523407, "learning_rate": 0.0002, "epoch": 1.1155378486055776, "step": 1260}, {"loss": 1.7634, "grad_norm": 0.3284400999546051, "learning_rate": 0.0002, "epoch": 1.1243913235945109, "step": 1270}, {"loss": 1.7441, "grad_norm": 0.3545348644256592, "learning_rate": 0.0002, "epoch": 1.133244798583444, "step": 1280}, {"loss": 1.7343, "grad_norm": 0.3489900529384613, "learning_rate": 0.0002, "epoch": 1.1420982735723773, "step": 1290}, {"loss": 1.6399, "grad_norm": 0.40355560183525085, "learning_rate": 0.0002, "epoch": 1.1509517485613103, "step": 1300}, {"loss": 1.7658, "grad_norm": 0.3369944095611572, "learning_rate": 0.0002, "epoch": 1.1598052235502434, "step": 1310}, {"loss": 1.7098, "grad_norm": 0.39141345024108887, "learning_rate": 0.0002, "epoch": 1.1686586985391767, "step": 1320}, {"loss": 1.6628, "grad_norm": 0.36518552899360657, "learning_rate": 0.0002, "epoch": 1.1775121735281098, "step": 1330}, {"loss": 1.6958, "grad_norm": 0.3730056583881378, "learning_rate": 0.0002, "epoch": 1.1863656485170428, "step": 1340}, {"loss": 1.7613, "grad_norm": 0.37711501121520996, "learning_rate": 0.0002, "epoch": 1.1952191235059761, "step": 1350}, {"loss": 1.6423, "grad_norm": 0.3627128005027771, "learning_rate": 0.0002, "epoch": 1.2040725984949092, "step": 1360}, {"loss": 1.7214, "grad_norm": 0.3458651006221771, "learning_rate": 0.0002, "epoch": 1.2129260734838425, "step": 1370}, {"loss": 1.6978, "grad_norm": 0.392395555973053, "learning_rate": 0.0002, "epoch": 1.2217795484727756, "step": 1380}, {"loss": 1.7785, "grad_norm": 0.3353286683559418, "learning_rate": 0.0002, "epoch": 1.2306330234617087, "step": 1390}, {"loss": 1.7019, "grad_norm": 0.9545007944107056, "learning_rate": 0.0002, "epoch": 1.239486498450642, "step": 1400}, {"loss": 1.725, "grad_norm": 0.37037935853004456, "learning_rate": 0.0002, "epoch": 1.248339973439575, "step": 1410}, {"loss": 1.6818, "grad_norm": 0.3831497132778168, "learning_rate": 0.0002, "epoch": 1.257193448428508, "step": 1420}, {"loss": 1.747, "grad_norm": 0.4633576273918152, "learning_rate": 0.0002, "epoch": 1.2660469234174414, "step": 1430}, {"loss": 1.6864, "grad_norm": 0.3690567910671234, "learning_rate": 0.0002, "epoch": 1.2749003984063745, "step": 1440}, {"loss": 1.767, "grad_norm": 0.33980098366737366, "learning_rate": 0.0002, "epoch": 1.2837538733953076, "step": 1450}, {"loss": 1.6989, "grad_norm": 0.3731277287006378, "learning_rate": 0.0002, "epoch": 1.2926073483842409, "step": 1460}, {"loss": 1.6801, "grad_norm": 0.3781551122665405, "learning_rate": 0.0002, "epoch": 1.301460823373174, "step": 1470}, {"loss": 1.7551, "grad_norm": 0.36511561274528503, "learning_rate": 0.0002, "epoch": 1.310314298362107, "step": 1480}, {"loss": 1.6629, "grad_norm": 0.3292245864868164, "learning_rate": 0.0002, "epoch": 1.3191677733510403, "step": 1490}, {"loss": 1.7098, "grad_norm": 0.38758566975593567, "learning_rate": 0.0002, "epoch": 1.3280212483399734, "step": 1500}, {"loss": 1.7364, "grad_norm": 0.3993414044380188, "learning_rate": 0.0002, "epoch": 1.3368747233289067, "step": 1510}, {"loss": 1.7202, "grad_norm": 0.35689303278923035, "learning_rate": 0.0002, "epoch": 1.3457281983178397, "step": 1520}, {"loss": 1.7082, "grad_norm": 0.41849321126937866, "learning_rate": 0.0002, "epoch": 1.354581673306773, "step": 1530}, {"loss": 1.7488, "grad_norm": 0.36752554774284363, "learning_rate": 0.0002, "epoch": 1.3634351482957061, "step": 1540}, {"loss": 1.7032, "grad_norm": 0.36915940046310425, "learning_rate": 0.0002, "epoch": 1.3722886232846392, "step": 1550}, {"loss": 1.6698, "grad_norm": 0.3656710386276245, "learning_rate": 0.0002, "epoch": 1.3811420982735725, "step": 1560}, {"loss": 1.7269, "grad_norm": 0.32055532932281494, "learning_rate": 0.0002, "epoch": 1.3899955732625056, "step": 1570}, {"loss": 1.8, "grad_norm": 0.35031241178512573, "learning_rate": 0.0002, "epoch": 1.3988490482514386, "step": 1580}, {"loss": 1.6667, "grad_norm": 0.44541189074516296, "learning_rate": 0.0002, "epoch": 1.407702523240372, "step": 1590}, {"loss": 1.8624, "grad_norm": 0.36922356486320496, "learning_rate": 0.0002, "epoch": 1.416555998229305, "step": 1600}, {"loss": 1.7011, "grad_norm": 0.3470565974712372, "learning_rate": 0.0002, "epoch": 1.425409473218238, "step": 1610}, {"loss": 1.6912, "grad_norm": 0.3743111193180084, "learning_rate": 0.0002, "epoch": 1.4342629482071714, "step": 1620}, {"loss": 1.752, "grad_norm": 0.3619250953197479, "learning_rate": 0.0002, "epoch": 1.4431164231961044, "step": 1630}, {"loss": 1.6919, "grad_norm": 0.4028145968914032, "learning_rate": 0.0002, "epoch": 1.4519698981850375, "step": 1640}, {"loss": 1.75, "grad_norm": 0.36065351963043213, "learning_rate": 0.0002, "epoch": 1.4608233731739708, "step": 1650}, {"loss": 1.8212, "grad_norm": 0.44304442405700684, "learning_rate": 0.0002, "epoch": 1.469676848162904, "step": 1660}, {"loss": 1.6691, "grad_norm": 0.35770007967948914, "learning_rate": 0.0002, "epoch": 1.478530323151837, "step": 1670}, {"loss": 1.7588, "grad_norm": 0.37584400177001953, "learning_rate": 0.0002, "epoch": 1.4873837981407703, "step": 1680}, {"loss": 1.63, "grad_norm": 0.37151241302490234, "learning_rate": 0.0002, "epoch": 1.4962372731297033, "step": 1690}, {"loss": 1.6675, "grad_norm": 0.36422812938690186, "learning_rate": 0.0002, "epoch": 1.5050907481186364, "step": 1700}, {"loss": 1.7045, "grad_norm": 0.3680015206336975, "learning_rate": 0.0002, "epoch": 1.5139442231075697, "step": 1710}, {"loss": 1.6917, "grad_norm": 0.3356926441192627, "learning_rate": 0.0002, "epoch": 1.522797698096503, "step": 1720}, {"loss": 1.7108, "grad_norm": 0.37887054681777954, "learning_rate": 0.0002, "epoch": 1.531651173085436, "step": 1730}, {"loss": 1.7001, "grad_norm": 0.37052762508392334, "learning_rate": 0.0002, "epoch": 1.5405046480743692, "step": 1740}, {"loss": 1.6677, "grad_norm": 0.333925724029541, "learning_rate": 0.0002, "epoch": 1.5493581230633025, "step": 1750}, {"loss": 1.7159, "grad_norm": 0.3722778558731079, "learning_rate": 0.0002, "epoch": 1.5582115980522355, "step": 1760}, {"loss": 1.6923, "grad_norm": 0.3331141173839569, "learning_rate": 0.0002, "epoch": 1.5670650730411686, "step": 1770}, {"loss": 1.7444, "grad_norm": 0.3670045733451843, "learning_rate": 0.0002, "epoch": 1.575918548030102, "step": 1780}, {"loss": 1.7092, "grad_norm": 0.3769885301589966, "learning_rate": 0.0002, "epoch": 1.584772023019035, "step": 1790}, {"loss": 1.6689, "grad_norm": 0.4266890287399292, "learning_rate": 0.0002, "epoch": 1.593625498007968, "step": 1800}, {"loss": 1.6859, "grad_norm": 0.37174347043037415, "learning_rate": 0.0002, "epoch": 1.6024789729969013, "step": 1810}, {"loss": 1.6793, "grad_norm": 0.3599846363067627, "learning_rate": 0.0002, "epoch": 1.6113324479858344, "step": 1820}, {"loss": 1.6836, "grad_norm": 0.3364820182323456, "learning_rate": 0.0002, "epoch": 1.6201859229747675, "step": 1830}, {"loss": 1.7278, "grad_norm": 0.3874799907207489, "learning_rate": 0.0002, "epoch": 1.6290393979637008, "step": 1840}, {"loss": 1.705, "grad_norm": 0.3706085681915283, "learning_rate": 0.0002, "epoch": 1.6378928729526339, "step": 1850}, {"loss": 1.6761, "grad_norm": 0.3997809886932373, "learning_rate": 0.0002, "epoch": 1.646746347941567, "step": 1860}, {"loss": 1.7983, "grad_norm": 0.4033166170120239, "learning_rate": 0.0002, "epoch": 1.6555998229305002, "step": 1870}, {"loss": 1.6518, "grad_norm": 0.3944370150566101, "learning_rate": 0.0002, "epoch": 1.6644532979194335, "step": 1880}, {"loss": 1.6017, "grad_norm": 0.3467825651168823, "learning_rate": 0.0002, "epoch": 1.6733067729083664, "step": 1890}, {"loss": 1.7462, "grad_norm": 0.35290950536727905, "learning_rate": 0.0002, "epoch": 1.6821602478972997, "step": 1900}, {"loss": 1.7634, "grad_norm": 0.3664521872997284, "learning_rate": 0.0002, "epoch": 1.691013722886233, "step": 1910}, {"loss": 1.7922, "grad_norm": 0.33863595128059387, "learning_rate": 0.0002, "epoch": 1.699867197875166, "step": 1920}, {"loss": 1.7048, "grad_norm": 0.34726113080978394, "learning_rate": 0.0002, "epoch": 1.7087206728640991, "step": 1930}, {"loss": 1.6664, "grad_norm": 0.35060688853263855, "learning_rate": 0.0002, "epoch": 1.7175741478530324, "step": 1940}, {"loss": 1.7577, "grad_norm": 0.33741647005081177, "learning_rate": 0.0002, "epoch": 1.7264276228419655, "step": 1950}, {"loss": 1.6971, "grad_norm": 0.36190304160118103, "learning_rate": 0.0002, "epoch": 1.7352810978308986, "step": 1960}, {"loss": 1.7238, "grad_norm": 0.3412845730781555, "learning_rate": 0.0002, "epoch": 1.7441345728198319, "step": 1970}, {"loss": 1.7038, "grad_norm": 0.3841935694217682, "learning_rate": 0.0002, "epoch": 1.752988047808765, "step": 1980}, {"loss": 1.7185, "grad_norm": 0.39062076807022095, "learning_rate": 0.0002, "epoch": 1.761841522797698, "step": 1990}, {"loss": 1.7346, "grad_norm": 0.3741697669029236, "learning_rate": 0.0002, "epoch": 1.7706949977866313, "step": 2000}, {"loss": 1.6864, "grad_norm": 0.4160231053829193, "learning_rate": 0.0002, "epoch": 1.7795484727755644, "step": 2010}, {"loss": 1.7572, "grad_norm": 0.3602111339569092, "learning_rate": 0.0002, "epoch": 1.7884019477644975, "step": 2020}, {"loss": 1.6139, "grad_norm": 0.36740878224372864, "learning_rate": 0.0002, "epoch": 1.7972554227534308, "step": 2030}, {"loss": 1.7043, "grad_norm": 0.419039249420166, "learning_rate": 0.0002, "epoch": 1.8061088977423638, "step": 2040}, {"loss": 1.7847, "grad_norm": 0.3511838912963867, "learning_rate": 0.0002, "epoch": 1.814962372731297, "step": 2050}, {"loss": 1.6477, "grad_norm": 0.3580166697502136, "learning_rate": 0.0002, "epoch": 1.8238158477202302, "step": 2060}, {"loss": 1.7562, "grad_norm": 0.40928223729133606, "learning_rate": 0.0002, "epoch": 1.8326693227091635, "step": 2070}, {"loss": 1.7356, "grad_norm": 0.37134310603141785, "learning_rate": 0.0002, "epoch": 1.8415227976980963, "step": 2080}, {"loss": 1.6829, "grad_norm": 0.3924112319946289, "learning_rate": 0.0002, "epoch": 1.8503762726870296, "step": 2090}, {"loss": 1.6785, "grad_norm": 0.3215042054653168, "learning_rate": 0.0002, "epoch": 1.859229747675963, "step": 2100}, {"loss": 1.6864, "grad_norm": 0.37674015760421753, "learning_rate": 0.0002, "epoch": 1.868083222664896, "step": 2110}, {"loss": 1.7313, "grad_norm": 0.370856374502182, "learning_rate": 0.0002, "epoch": 1.876936697653829, "step": 2120}, {"loss": 1.7163, "grad_norm": 0.35783782601356506, "learning_rate": 0.0002, "epoch": 1.8857901726427624, "step": 2130}, {"loss": 1.7655, "grad_norm": 0.39538058638572693, "learning_rate": 0.0002, "epoch": 1.8946436476316955, "step": 2140}, {"loss": 1.6614, "grad_norm": 0.36677780747413635, "learning_rate": 0.0002, "epoch": 1.9034971226206285, "step": 2150}, {"loss": 1.6959, "grad_norm": 0.39032700657844543, "learning_rate": 0.0002, "epoch": 1.9123505976095618, "step": 2160}, {"loss": 1.7643, "grad_norm": 0.39762043952941895, "learning_rate": 0.0002, "epoch": 1.921204072598495, "step": 2170}, {"loss": 1.6767, "grad_norm": 0.5400257110595703, "learning_rate": 0.0002, "epoch": 1.930057547587428, "step": 2180}, {"loss": 1.7262, "grad_norm": 0.3650212287902832, "learning_rate": 0.0002, "epoch": 1.9389110225763613, "step": 2190}, {"loss": 1.7027, "grad_norm": 0.3583165109157562, "learning_rate": 0.0002, "epoch": 1.9477644975652944, "step": 2200}, {"loss": 1.7241, "grad_norm": 0.4031282365322113, "learning_rate": 0.0002, "epoch": 1.9566179725542274, "step": 2210}, {"loss": 1.7617, "grad_norm": 0.3673221170902252, "learning_rate": 0.0002, "epoch": 1.9654714475431607, "step": 2220}, {"loss": 1.6862, "grad_norm": 0.3920327126979828, "learning_rate": 0.0002, "epoch": 1.9743249225320938, "step": 2230}, {"loss": 1.7192, "grad_norm": 0.4765491783618927, "learning_rate": 0.0002, "epoch": 1.9831783975210269, "step": 2240}, {"loss": 1.7759, "grad_norm": 0.38130584359169006, "learning_rate": 0.0002, "epoch": 1.9920318725099602, "step": 2250}, {"eval_loss": 1.8077166080474854, "eval_runtime": 82.8351, "eval_samples_per_second": 6.217, "eval_steps_per_second": 0.785, "epoch": 2.0, "step": 2259}, {"loss": 1.7081, "grad_norm": 0.34340235590934753, "learning_rate": 0.0002, "epoch": 2.0008853474988935, "step": 2260}, {"loss": 1.6815, "grad_norm": 0.3710762858390808, "learning_rate": 0.0002, "epoch": 2.0097388224878263, "step": 2270}, {"loss": 1.5828, "grad_norm": 0.35640114545822144, "learning_rate": 0.0002, "epoch": 2.0185922974767596, "step": 2280}, {"loss": 1.6322, "grad_norm": 0.45970189571380615, "learning_rate": 0.0002, "epoch": 2.027445772465693, "step": 2290}, {"loss": 1.5598, "grad_norm": 0.4256797134876251, "learning_rate": 0.0002, "epoch": 2.0362992474546258, "step": 2300}, {"loss": 1.6271, "grad_norm": 0.42421531677246094, "learning_rate": 0.0002, "epoch": 2.045152722443559, "step": 2310}, {"loss": 1.6117, "grad_norm": 0.4032478928565979, "learning_rate": 0.0002, "epoch": 2.0540061974324924, "step": 2320}, {"loss": 1.6389, "grad_norm": 0.4073623716831207, "learning_rate": 0.0002, "epoch": 2.062859672421425, "step": 2330}, {"loss": 1.6527, "grad_norm": 0.4845200777053833, "learning_rate": 0.0002, "epoch": 2.0717131474103585, "step": 2340}, {"loss": 1.5734, "grad_norm": 0.40578293800354004, "learning_rate": 0.0002, "epoch": 2.080566622399292, "step": 2350}, {"loss": 1.5853, "grad_norm": 0.4037284255027771, "learning_rate": 0.0002, "epoch": 2.089420097388225, "step": 2360}, {"loss": 1.6511, "grad_norm": 0.4717613160610199, "learning_rate": 0.0002, "epoch": 2.098273572377158, "step": 2370}, {"loss": 1.6273, "grad_norm": 0.42076411843299866, "learning_rate": 0.0002, "epoch": 2.1071270473660912, "step": 2380}, {"loss": 1.654, "grad_norm": 0.47799113392829895, "learning_rate": 0.0002, "epoch": 2.1159805223550245, "step": 2390}, {"loss": 1.5528, "grad_norm": 0.4253084063529968, "learning_rate": 0.0002, "epoch": 2.1248339973439574, "step": 2400}, {"loss": 1.6432, "grad_norm": 0.5023085474967957, "learning_rate": 0.0002, "epoch": 2.1336874723328907, "step": 2410}, {"loss": 1.5926, "grad_norm": 0.49162712693214417, "learning_rate": 0.0002, "epoch": 2.142540947321824, "step": 2420}, {"loss": 1.5779, "grad_norm": 0.39035019278526306, "learning_rate": 0.0002, "epoch": 2.151394422310757, "step": 2430}, {"loss": 1.7526, "grad_norm": 0.43223854899406433, "learning_rate": 0.0002, "epoch": 2.16024789729969, "step": 2440}, {"loss": 1.6334, "grad_norm": 0.4596616327762604, "learning_rate": 0.0002, "epoch": 2.1691013722886234, "step": 2450}, {"loss": 1.6067, "grad_norm": 0.4469447731971741, "learning_rate": 0.0002, "epoch": 2.1779548472775563, "step": 2460}, {"loss": 1.5806, "grad_norm": 0.5100595355033875, "learning_rate": 0.0002, "epoch": 2.1868083222664896, "step": 2470}, {"loss": 1.6456, "grad_norm": 0.4169430732727051, "learning_rate": 0.0002, "epoch": 2.195661797255423, "step": 2480}, {"loss": 1.6734, "grad_norm": 0.4699254035949707, "learning_rate": 0.0002, "epoch": 2.2045152722443557, "step": 2490}, {"loss": 1.6259, "grad_norm": 0.43524250388145447, "learning_rate": 0.0002, "epoch": 2.213368747233289, "step": 2500}, {"loss": 1.6717, "grad_norm": 0.4496648907661438, "learning_rate": 0.0002, "epoch": 2.2222222222222223, "step": 2510}, {"loss": 1.6735, "grad_norm": 0.43408212065696716, "learning_rate": 0.0002, "epoch": 2.231075697211155, "step": 2520}, {"loss": 1.611, "grad_norm": 0.4596034288406372, "learning_rate": 0.0002, "epoch": 2.2399291722000885, "step": 2530}, {"loss": 1.6271, "grad_norm": 0.5217021107673645, "learning_rate": 0.0002, "epoch": 2.2487826471890218, "step": 2540}, {"loss": 1.6027, "grad_norm": 0.44745638966560364, "learning_rate": 0.0002, "epoch": 2.2576361221779546, "step": 2550}, {"loss": 1.675, "grad_norm": 0.4484798014163971, "learning_rate": 0.0002, "epoch": 2.266489597166888, "step": 2560}, {"loss": 1.5321, "grad_norm": 0.4428067207336426, "learning_rate": 0.0002, "epoch": 2.275343072155821, "step": 2570}, {"loss": 1.6716, "grad_norm": 0.5095171332359314, "learning_rate": 0.0002, "epoch": 2.2841965471447545, "step": 2580}, {"loss": 1.5661, "grad_norm": 0.44833096861839294, "learning_rate": 0.0002, "epoch": 2.2930500221336874, "step": 2590}, {"loss": 1.652, "grad_norm": 0.507905900478363, "learning_rate": 0.0002, "epoch": 2.3019034971226207, "step": 2600}, {"loss": 1.5963, "grad_norm": 0.40808171033859253, "learning_rate": 0.0002, "epoch": 2.310756972111554, "step": 2610}, {"loss": 1.6574, "grad_norm": 0.4684814214706421, "learning_rate": 0.0002, "epoch": 2.319610447100487, "step": 2620}, {"loss": 1.587, "grad_norm": 0.44864922761917114, "learning_rate": 0.0002, "epoch": 2.32846392208942, "step": 2630}, {"loss": 1.5828, "grad_norm": 0.4174162745475769, "learning_rate": 0.0002, "epoch": 2.3373173970783534, "step": 2640}, {"loss": 1.642, "grad_norm": 0.42314743995666504, "learning_rate": 0.0002, "epoch": 2.3461708720672863, "step": 2650}, {"loss": 1.5884, "grad_norm": 0.49224185943603516, "learning_rate": 0.0002, "epoch": 2.3550243470562195, "step": 2660}, {"loss": 1.5766, "grad_norm": 0.45190292596817017, "learning_rate": 0.0002, "epoch": 2.363877822045153, "step": 2670}, {"loss": 1.6284, "grad_norm": 0.41817107796669006, "learning_rate": 0.0002, "epoch": 2.3727312970340857, "step": 2680}, {"loss": 1.6356, "grad_norm": 0.6436763405799866, "learning_rate": 0.0002, "epoch": 2.381584772023019, "step": 2690}, {"loss": 1.5915, "grad_norm": 0.47175949811935425, "learning_rate": 0.0002, "epoch": 2.3904382470119523, "step": 2700}, {"loss": 1.6303, "grad_norm": 0.480339378118515, "learning_rate": 0.0002, "epoch": 2.3992917220008856, "step": 2710}, {"loss": 1.5697, "grad_norm": 0.4723486006259918, "learning_rate": 0.0002, "epoch": 2.4081451969898184, "step": 2720}, {"loss": 1.54, "grad_norm": 0.4305492043495178, "learning_rate": 0.0002, "epoch": 2.4169986719787517, "step": 2730}, {"loss": 1.71, "grad_norm": 0.5007492303848267, "learning_rate": 0.0002, "epoch": 2.425852146967685, "step": 2740}, {"loss": 1.5369, "grad_norm": 0.5374062061309814, "learning_rate": 0.0002, "epoch": 2.434705621956618, "step": 2750}, {"loss": 1.6156, "grad_norm": 0.45866212248802185, "learning_rate": 0.0002, "epoch": 2.443559096945551, "step": 2760}, {"loss": 1.6066, "grad_norm": 0.47914502024650574, "learning_rate": 0.0002, "epoch": 2.4524125719344845, "step": 2770}, {"loss": 1.5644, "grad_norm": 0.43804746866226196, "learning_rate": 0.0002, "epoch": 2.4612660469234173, "step": 2780}, {"loss": 1.5952, "grad_norm": 0.43656906485557556, "learning_rate": 0.0002, "epoch": 2.4701195219123506, "step": 2790}, {"loss": 1.6311, "grad_norm": 0.4820363521575928, "learning_rate": 0.0002, "epoch": 2.478972996901284, "step": 2800}, {"loss": 1.5375, "grad_norm": 0.4916800558567047, "learning_rate": 0.0002, "epoch": 2.4878264718902168, "step": 2810}, {"loss": 1.5736, "grad_norm": 0.4521256983280182, "learning_rate": 0.0002, "epoch": 2.49667994687915, "step": 2820}, {"loss": 1.6179, "grad_norm": 0.5066806674003601, "learning_rate": 0.0002, "epoch": 2.5055334218680834, "step": 2830}, {"loss": 1.5812, "grad_norm": 0.4768151640892029, "learning_rate": 0.0002, "epoch": 2.514386896857016, "step": 2840}, {"loss": 1.6719, "grad_norm": 0.5144683718681335, "learning_rate": 0.0002, "epoch": 2.5232403718459495, "step": 2850}, {"loss": 1.6063, "grad_norm": 0.4718942940235138, "learning_rate": 0.0002, "epoch": 2.532093846834883, "step": 2860}, {"loss": 1.6099, "grad_norm": 0.4924587309360504, "learning_rate": 0.0002, "epoch": 2.5409473218238157, "step": 2870}, {"loss": 1.5994, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "epoch": 2.549800796812749, "step": 2880}, {"loss": 1.6501, "grad_norm": 0.4836665987968445, "learning_rate": 0.0002, "epoch": 2.5586542718016823, "step": 2890}, {"loss": 1.6518, "grad_norm": 0.4162124991416931, "learning_rate": 0.0002, "epoch": 2.567507746790615, "step": 2900}, {"loss": 1.6471, "grad_norm": 0.4894537925720215, "learning_rate": 0.0002, "epoch": 2.5763612217795484, "step": 2910}, {"loss": 1.6123, "grad_norm": 0.4539397358894348, "learning_rate": 0.0002, "epoch": 2.5852146967684817, "step": 2920}, {"loss": 1.6449, "grad_norm": 0.4718773066997528, "learning_rate": 0.0002, "epoch": 2.5940681717574146, "step": 2930}, {"loss": 1.584, "grad_norm": 0.49989837408065796, "learning_rate": 0.0002, "epoch": 2.602921646746348, "step": 2940}, {"loss": 1.6087, "grad_norm": 0.4862406849861145, "learning_rate": 0.0002, "epoch": 2.611775121735281, "step": 2950}, {"loss": 1.6057, "grad_norm": 0.4244804382324219, "learning_rate": 0.0002, "epoch": 2.620628596724214, "step": 2960}, {"loss": 1.7795, "grad_norm": 0.49304354190826416, "learning_rate": 0.0002, "epoch": 2.6294820717131473, "step": 2970}, {"loss": 1.7255, "grad_norm": 0.4818236529827118, "learning_rate": 0.0002, "epoch": 2.6383355467020806, "step": 2980}, {"loss": 1.621, "grad_norm": 0.5077425837516785, "learning_rate": 0.0002, "epoch": 2.647189021691014, "step": 2990}, {"loss": 1.7064, "grad_norm": 0.4494157135486603, "learning_rate": 0.0002, "epoch": 2.6560424966799467, "step": 3000}, {"loss": 1.6792, "grad_norm": 0.4790278971195221, "learning_rate": 0.0002, "epoch": 2.66489597166888, "step": 3010}, {"loss": 1.6082, "grad_norm": 0.4702624976634979, "learning_rate": 0.0002, "epoch": 2.6737494466578133, "step": 3020}, {"loss": 1.6494, "grad_norm": 0.5082133412361145, "learning_rate": 0.0002, "epoch": 2.682602921646746, "step": 3030}, {"loss": 1.6438, "grad_norm": 0.4553256630897522, "learning_rate": 0.0002, "epoch": 2.6914563966356795, "step": 3040}, {"loss": 1.6155, "grad_norm": 0.4492715001106262, "learning_rate": 0.0002, "epoch": 2.700309871624613, "step": 3050}, {"loss": 1.5367, "grad_norm": 0.4555944502353668, "learning_rate": 0.0002, "epoch": 2.709163346613546, "step": 3060}, {"loss": 1.5793, "grad_norm": 0.5879693031311035, "learning_rate": 0.0002, "epoch": 2.718016821602479, "step": 3070}, {"loss": 1.6357, "grad_norm": 0.4628562927246094, "learning_rate": 0.0002, "epoch": 2.7268702965914122, "step": 3080}, {"loss": 1.6585, "grad_norm": 0.5169575810432434, "learning_rate": 0.0002, "epoch": 2.7357237715803455, "step": 3090}, {"loss": 1.562, "grad_norm": 0.4630090892314911, "learning_rate": 0.0002, "epoch": 2.7445772465692784, "step": 3100}, {"loss": 1.5508, "grad_norm": 0.5437219738960266, "learning_rate": 0.0002, "epoch": 2.7534307215582117, "step": 3110}, {"loss": 1.6442, "grad_norm": 0.5102152228355408, "learning_rate": 0.0002, "epoch": 2.762284196547145, "step": 3120}, {"loss": 1.5448, "grad_norm": 0.48287826776504517, "learning_rate": 0.0002, "epoch": 2.771137671536078, "step": 3130}, {"loss": 1.6657, "grad_norm": 0.4671737253665924, "learning_rate": 0.0002, "epoch": 2.779991146525011, "step": 3140}, {"loss": 1.5864, "grad_norm": 0.5177035331726074, "learning_rate": 0.0002, "epoch": 2.7888446215139444, "step": 3150}, {"loss": 1.5617, "grad_norm": 0.450989305973053, "learning_rate": 0.0002, "epoch": 2.7976980965028773, "step": 3160}, {"loss": 1.597, "grad_norm": 0.45007848739624023, "learning_rate": 0.0002, "epoch": 2.8065515714918106, "step": 3170}, {"loss": 1.7179, "grad_norm": 0.4600294530391693, "learning_rate": 0.0002, "epoch": 2.815405046480744, "step": 3180}, {"loss": 1.6441, "grad_norm": 0.485628604888916, "learning_rate": 0.0002, "epoch": 2.8242585214696767, "step": 3190}, {"loss": 1.6396, "grad_norm": 0.49811574816703796, "learning_rate": 0.0002, "epoch": 2.83311199645861, "step": 3200}, {"loss": 1.6067, "grad_norm": 0.5012516975402832, "learning_rate": 0.0002, "epoch": 2.8419654714475433, "step": 3210}, {"loss": 1.6188, "grad_norm": 0.4552757740020752, "learning_rate": 0.0002, "epoch": 2.850818946436476, "step": 3220}, {"loss": 1.5993, "grad_norm": 0.4539635479450226, "learning_rate": 0.0002, "epoch": 2.8596724214254094, "step": 3230}, {"loss": 1.5957, "grad_norm": 0.5534685850143433, "learning_rate": 0.0002, "epoch": 2.8685258964143427, "step": 3240}, {"loss": 1.6065, "grad_norm": 0.4570811688899994, "learning_rate": 0.0002, "epoch": 2.8773793714032756, "step": 3250}, {"loss": 1.6016, "grad_norm": 0.48181653022766113, "learning_rate": 0.0002, "epoch": 2.886232846392209, "step": 3260}, {"loss": 1.6574, "grad_norm": 0.4871032238006592, "learning_rate": 0.0002, "epoch": 2.895086321381142, "step": 3270}, {"loss": 1.5626, "grad_norm": 0.4643239676952362, "learning_rate": 0.0002, "epoch": 2.903939796370075, "step": 3280}, {"loss": 1.5981, "grad_norm": 0.5024484395980835, "learning_rate": 0.0002, "epoch": 2.9127932713590083, "step": 3290}, {"loss": 1.5756, "grad_norm": 0.4425384998321533, "learning_rate": 0.0002, "epoch": 2.9216467463479416, "step": 3300}, {"loss": 1.644, "grad_norm": 0.459168016910553, "learning_rate": 0.0002, "epoch": 2.9305002213368745, "step": 3310}, {"loss": 1.6404, "grad_norm": 0.4950717091560364, "learning_rate": 0.0002, "epoch": 2.939353696325808, "step": 3320}, {"loss": 1.652, "grad_norm": 0.4516230523586273, "learning_rate": 0.0002, "epoch": 2.948207171314741, "step": 3330}, {"loss": 1.5917, "grad_norm": 0.49523285031318665, "learning_rate": 0.0002, "epoch": 2.957060646303674, "step": 3340}, {"loss": 1.733, "grad_norm": 0.49282631278038025, "learning_rate": 0.0002, "epoch": 2.9659141212926072, "step": 3350}, {"loss": 1.6519, "grad_norm": 0.45825016498565674, "learning_rate": 0.0002, "epoch": 2.9747675962815405, "step": 3360}, {"loss": 1.6607, "grad_norm": 0.4952891170978546, "learning_rate": 0.0002, "epoch": 2.983621071270474, "step": 3370}, {"loss": 1.5981, "grad_norm": 0.42182639241218567, "learning_rate": 0.0002, "epoch": 2.9924745462594067, "step": 3380}, {"eval_loss": 1.8308420181274414, "eval_runtime": 82.786, "eval_samples_per_second": 6.221, "eval_steps_per_second": 0.785, "epoch": 2.9995573262505535, "step": 3388}, {"loss": 1.5811, "grad_norm": 0.47721418738365173, "learning_rate": 0.0002, "epoch": 3.00132802124834, "step": 3390}, {"loss": 1.5137, "grad_norm": 0.5284923911094666, "learning_rate": 0.0002, "epoch": 3.0101814962372733, "step": 3400}, {"loss": 1.437, "grad_norm": 0.5607061982154846, "learning_rate": 0.0002, "epoch": 3.019034971226206, "step": 3410}, {"loss": 1.4909, "grad_norm": 0.5271363258361816, "learning_rate": 0.0002, "epoch": 3.0278884462151394, "step": 3420}, {"loss": 1.5645, "grad_norm": 0.48660898208618164, "learning_rate": 0.0002, "epoch": 3.0367419212040727, "step": 3430}, {"loss": 1.4754, "grad_norm": 0.5767933130264282, "learning_rate": 0.0002, "epoch": 3.0455953961930056, "step": 3440}, {"loss": 1.4647, "grad_norm": 0.5591282248497009, "learning_rate": 0.0002, "epoch": 3.054448871181939, "step": 3450}, {"loss": 1.5112, "grad_norm": 0.5870814323425293, "learning_rate": 0.0002, "epoch": 3.063302346170872, "step": 3460}, {"loss": 1.4682, "grad_norm": 0.4861546456813812, "learning_rate": 0.0002, "epoch": 3.072155821159805, "step": 3470}, {"loss": 1.4883, "grad_norm": 0.5238925814628601, "learning_rate": 0.0002, "epoch": 3.0810092961487383, "step": 3480}, {"loss": 1.4855, "grad_norm": 0.5521751046180725, "learning_rate": 0.0002, "epoch": 3.0898627711376716, "step": 3490}, {"loss": 1.4454, "grad_norm": 0.5816575884819031, "learning_rate": 0.0002, "epoch": 3.098716246126605, "step": 3500}, {"loss": 1.5113, "grad_norm": 0.5281513333320618, "learning_rate": 0.0002, "epoch": 3.1075697211155378, "step": 3510}, {"loss": 1.4723, "grad_norm": 0.5847303867340088, "learning_rate": 0.0002, "epoch": 3.116423196104471, "step": 3520}, {"loss": 1.5513, "grad_norm": 0.5683517456054688, "learning_rate": 0.0002, "epoch": 3.1252766710934043, "step": 3530}, {"loss": 1.532, "grad_norm": 0.5177015662193298, "learning_rate": 0.0002, "epoch": 3.134130146082337, "step": 3540}, {"loss": 1.4921, "grad_norm": 0.5922423601150513, "learning_rate": 0.0002, "epoch": 3.1429836210712705, "step": 3550}, {"loss": 1.5329, "grad_norm": 0.7018587589263916, "learning_rate": 0.0002, "epoch": 3.151837096060204, "step": 3560}, {"loss": 1.4677, "grad_norm": 0.6152004599571228, "learning_rate": 0.0002, "epoch": 3.1606905710491366, "step": 3570}, {"loss": 1.4288, "grad_norm": 0.5350717902183533, "learning_rate": 0.0002, "epoch": 3.16954404603807, "step": 3580}, {"loss": 1.4739, "grad_norm": 0.5971009731292725, "learning_rate": 0.0002, "epoch": 3.1783975210270032, "step": 3590}, {"loss": 1.541, "grad_norm": 0.7312001585960388, "learning_rate": 0.0002, "epoch": 3.187250996015936, "step": 3600}, {"loss": 1.5803, "grad_norm": 0.6372535228729248, "learning_rate": 0.0002, "epoch": 3.1961044710048694, "step": 3610}, {"loss": 1.4642, "grad_norm": 0.6098020672798157, "learning_rate": 0.0002, "epoch": 3.2049579459938027, "step": 3620}, {"loss": 1.5149, "grad_norm": 0.5506435632705688, "learning_rate": 0.0002, "epoch": 3.2138114209827355, "step": 3630}, {"loss": 1.4338, "grad_norm": 0.6043022274971008, "learning_rate": 0.0002, "epoch": 3.222664895971669, "step": 3640}, {"loss": 1.5351, "grad_norm": 0.5495519042015076, "learning_rate": 0.0002, "epoch": 3.231518370960602, "step": 3650}, {"loss": 1.3879, "grad_norm": 0.5769572257995605, "learning_rate": 0.0002, "epoch": 3.240371845949535, "step": 3660}, {"loss": 1.4604, "grad_norm": 0.6833786964416504, "learning_rate": 0.0002, "epoch": 3.2492253209384683, "step": 3670}, {"loss": 1.5091, "grad_norm": 0.6962856650352478, "learning_rate": 0.0002, "epoch": 3.2580787959274016, "step": 3680}, {"loss": 1.5212, "grad_norm": 0.6553098559379578, "learning_rate": 0.0002, "epoch": 3.2669322709163344, "step": 3690}, {"loss": 1.5416, "grad_norm": 0.5907557010650635, "learning_rate": 0.0002, "epoch": 3.2757857459052677, "step": 3700}, {"loss": 1.5012, "grad_norm": 0.5712862014770508, "learning_rate": 0.0002, "epoch": 3.284639220894201, "step": 3710}, {"loss": 1.5073, "grad_norm": 0.573820948600769, "learning_rate": 0.0002, "epoch": 3.2934926958831343, "step": 3720}, {"loss": 1.544, "grad_norm": 0.6650304198265076, "learning_rate": 0.0002, "epoch": 3.302346170872067, "step": 3730}, {"loss": 1.5069, "grad_norm": 0.5182583928108215, "learning_rate": 0.0002, "epoch": 3.3111996458610005, "step": 3740}, {"loss": 1.5254, "grad_norm": 0.5078902840614319, "learning_rate": 0.0002, "epoch": 3.3200531208499338, "step": 3750}, {"loss": 1.4881, "grad_norm": 0.7062374353408813, "learning_rate": 0.0002, "epoch": 3.3289065958388666, "step": 3760}, {"loss": 1.5017, "grad_norm": 0.5711262822151184, "learning_rate": 0.0002, "epoch": 3.3377600708278, "step": 3770}, {"loss": 1.4982, "grad_norm": 0.5624606013298035, "learning_rate": 0.0002, "epoch": 3.346613545816733, "step": 3780}, {"loss": 1.4515, "grad_norm": 0.6008231043815613, "learning_rate": 0.0002, "epoch": 3.355467020805666, "step": 3790}, {"loss": 1.5038, "grad_norm": 0.6120018362998962, "learning_rate": 0.0002, "epoch": 3.3643204957945994, "step": 3800}, {"loss": 1.4918, "grad_norm": 0.5679979920387268, "learning_rate": 0.0002, "epoch": 3.3731739707835326, "step": 3810}, {"loss": 1.5435, "grad_norm": 0.5613794922828674, "learning_rate": 0.0002, "epoch": 3.3820274457724655, "step": 3820}, {"loss": 1.5319, "grad_norm": 0.5328839421272278, "learning_rate": 0.0002, "epoch": 3.390880920761399, "step": 3830}, {"loss": 1.5262, "grad_norm": 0.5960017442703247, "learning_rate": 0.0002, "epoch": 3.399734395750332, "step": 3840}, {"loss": 1.4227, "grad_norm": 0.5264106392860413, "learning_rate": 0.0002, "epoch": 3.4085878707392654, "step": 3850}, {"loss": 1.4766, "grad_norm": 0.6378359198570251, "learning_rate": 0.0002, "epoch": 3.4174413457281982, "step": 3860}, {"loss": 1.4898, "grad_norm": 0.5792967677116394, "learning_rate": 0.0002, "epoch": 3.4262948207171315, "step": 3870}, {"loss": 1.4914, "grad_norm": 0.6836280822753906, "learning_rate": 0.0002, "epoch": 3.435148295706065, "step": 3880}, {"loss": 1.5002, "grad_norm": 0.6073971390724182, "learning_rate": 0.0002, "epoch": 3.4440017706949977, "step": 3890}, {"loss": 1.4473, "grad_norm": 0.5753195881843567, "learning_rate": 0.0002, "epoch": 3.452855245683931, "step": 3900}, {"loss": 1.5332, "grad_norm": 0.6007646918296814, "learning_rate": 0.0002, "epoch": 3.4617087206728643, "step": 3910}, {"loss": 1.515, "grad_norm": 0.6025636196136475, "learning_rate": 0.0002, "epoch": 3.470562195661797, "step": 3920}, {"loss": 1.4612, "grad_norm": 0.6819562315940857, "learning_rate": 0.0002, "epoch": 3.4794156706507304, "step": 3930}, {"loss": 1.518, "grad_norm": 0.6448395848274231, "learning_rate": 0.0002, "epoch": 3.4882691456396637, "step": 3940}, {"loss": 1.5194, "grad_norm": 0.5712178945541382, "learning_rate": 0.0002, "epoch": 3.4971226206285966, "step": 3950}, {"loss": 1.4757, "grad_norm": 0.6300532817840576, "learning_rate": 0.0002, "epoch": 3.50597609561753, "step": 3960}, {"loss": 1.5142, "grad_norm": 0.6120840907096863, "learning_rate": 0.0002, "epoch": 3.514829570606463, "step": 3970}, {"loss": 1.559, "grad_norm": 0.6887575387954712, "learning_rate": 0.0002, "epoch": 3.523683045595396, "step": 3980}, {"loss": 1.5591, "grad_norm": 0.6970235109329224, "learning_rate": 0.0002, "epoch": 3.5325365205843293, "step": 3990}, {"loss": 1.5198, "grad_norm": 0.5818213820457458, "learning_rate": 0.0002, "epoch": 3.5413899955732626, "step": 4000}, {"loss": 1.5367, "grad_norm": 1.0533310174942017, "learning_rate": 0.0002, "epoch": 3.5502434705621955, "step": 4010}, {"loss": 1.5399, "grad_norm": 0.5444280505180359, "learning_rate": 0.0002, "epoch": 3.5590969455511288, "step": 4020}, {"loss": 1.5573, "grad_norm": 0.6007506847381592, "learning_rate": 0.0002, "epoch": 3.567950420540062, "step": 4030}, {"loss": 1.5059, "grad_norm": 0.6088743805885315, "learning_rate": 0.0002, "epoch": 3.576803895528995, "step": 4040}, {"loss": 1.5174, "grad_norm": 0.5934239029884338, "learning_rate": 0.0002, "epoch": 3.585657370517928, "step": 4050}, {"loss": 1.4938, "grad_norm": 0.605251669883728, "learning_rate": 0.0002, "epoch": 3.5945108455068615, "step": 4060}, {"loss": 1.5142, "grad_norm": 0.5903469920158386, "learning_rate": 0.0002, "epoch": 3.6033643204957944, "step": 4070}, {"loss": 1.5234, "grad_norm": 0.6752413511276245, "learning_rate": 0.0002, "epoch": 3.6122177954847277, "step": 4080}, {"loss": 1.5041, "grad_norm": 0.5810418725013733, "learning_rate": 0.0002, "epoch": 3.621071270473661, "step": 4090}, {"loss": 1.5358, "grad_norm": 0.5918573141098022, "learning_rate": 0.0002, "epoch": 3.629924745462594, "step": 4100}, {"loss": 1.499, "grad_norm": 0.6635358333587646, "learning_rate": 0.0002, "epoch": 3.638778220451527, "step": 4110}, {"loss": 1.5021, "grad_norm": 0.5785038471221924, "learning_rate": 0.0002, "epoch": 3.6476316954404604, "step": 4120}, {"loss": 1.5711, "grad_norm": 0.5837879776954651, "learning_rate": 0.0002, "epoch": 3.6564851704293937, "step": 4130}, {"loss": 1.4273, "grad_norm": 0.6449324488639832, "learning_rate": 0.0002, "epoch": 3.6653386454183265, "step": 4140}, {"loss": 1.4608, "grad_norm": 0.6191908717155457, "learning_rate": 0.0002, "epoch": 3.67419212040726, "step": 4150}, {"loss": 1.4567, "grad_norm": 0.6937987208366394, "learning_rate": 0.0002, "epoch": 3.683045595396193, "step": 4160}, {"loss": 1.4136, "grad_norm": 0.581128716468811, "learning_rate": 0.0002, "epoch": 3.6918990703851264, "step": 4170}, {"loss": 1.4204, "grad_norm": 0.6547803282737732, "learning_rate": 0.0002, "epoch": 3.7007525453740593, "step": 4180}, {"loss": 1.4653, "grad_norm": 0.5961150527000427, "learning_rate": 0.0002, "epoch": 3.7096060203629926, "step": 4190}, {"loss": 1.4755, "grad_norm": 0.6197913885116577, "learning_rate": 0.0002, "epoch": 3.718459495351926, "step": 4200}, {"loss": 1.5191, "grad_norm": 0.688565194606781, "learning_rate": 0.0002, "epoch": 3.7273129703408587, "step": 4210}, {"loss": 1.5618, "grad_norm": 0.5832270979881287, "learning_rate": 0.0002, "epoch": 3.736166445329792, "step": 4220}, {"loss": 1.4747, "grad_norm": 0.5643884539604187, "learning_rate": 0.0002, "epoch": 3.7450199203187253, "step": 4230}, {"loss": 1.5242, "grad_norm": 0.6236484050750732, "learning_rate": 0.0002, "epoch": 3.753873395307658, "step": 4240}, {"loss": 1.576, "grad_norm": 0.5367720127105713, "learning_rate": 0.0002, "epoch": 3.7627268702965915, "step": 4250}, {"loss": 1.5234, "grad_norm": 0.5785109400749207, "learning_rate": 0.0002, "epoch": 3.7715803452855248, "step": 4260}, {"loss": 1.4947, "grad_norm": 0.5698465704917908, "learning_rate": 0.0002, "epoch": 3.7804338202744576, "step": 4270}, {"loss": 1.4769, "grad_norm": 0.5748036503791809, "learning_rate": 0.0002, "epoch": 3.789287295263391, "step": 4280}, {"loss": 1.5503, "grad_norm": 0.608147382736206, "learning_rate": 0.0002, "epoch": 3.798140770252324, "step": 4290}, {"loss": 1.5354, "grad_norm": 0.5820456147193909, "learning_rate": 0.0002, "epoch": 3.806994245241257, "step": 4300}, {"loss": 1.5668, "grad_norm": 0.6325612664222717, "learning_rate": 0.0002, "epoch": 3.8158477202301904, "step": 4310}, {"loss": 1.5295, "grad_norm": 0.6465362310409546, "learning_rate": 0.0002, "epoch": 3.8247011952191237, "step": 4320}, {"loss": 1.5048, "grad_norm": 0.5630854368209839, "learning_rate": 0.0002, "epoch": 3.8335546702080565, "step": 4330}, {"loss": 1.5636, "grad_norm": 0.6181462407112122, "learning_rate": 0.0002, "epoch": 3.84240814519699, "step": 4340}, {"loss": 1.5113, "grad_norm": 0.6207571029663086, "learning_rate": 0.0002, "epoch": 3.851261620185923, "step": 4350}, {"loss": 1.5424, "grad_norm": 0.6092919111251831, "learning_rate": 0.0002, "epoch": 3.860115095174856, "step": 4360}, {"loss": 1.5214, "grad_norm": 0.6140493750572205, "learning_rate": 0.0002, "epoch": 3.8689685701637893, "step": 4370}, {"loss": 1.5574, "grad_norm": 0.611575722694397, "learning_rate": 0.0002, "epoch": 3.8778220451527226, "step": 4380}, {"loss": 1.5563, "grad_norm": 0.6288794279098511, "learning_rate": 0.0002, "epoch": 3.8866755201416554, "step": 4390}, {"loss": 1.4967, "grad_norm": 0.6518979072570801, "learning_rate": 0.0002, "epoch": 3.8955289951305887, "step": 4400}, {"loss": 1.5366, "grad_norm": 0.6144753098487854, "learning_rate": 0.0002, "epoch": 3.904382470119522, "step": 4410}, {"loss": 1.6285, "grad_norm": 0.7034937143325806, "learning_rate": 0.0002, "epoch": 3.913235945108455, "step": 4420}, {"loss": 1.4978, "grad_norm": 0.5713187456130981, "learning_rate": 0.0002, "epoch": 3.922089420097388, "step": 4430}, {"loss": 1.5532, "grad_norm": 0.6187576651573181, "learning_rate": 0.0002, "epoch": 3.9309428950863214, "step": 4440}, {"loss": 1.551, "grad_norm": 0.6439383029937744, "learning_rate": 0.0002, "epoch": 3.9397963700752543, "step": 4450}, {"loss": 1.5073, "grad_norm": 0.6133334636688232, "learning_rate": 0.0002, "epoch": 3.9486498450641876, "step": 4460}, {"loss": 1.538, "grad_norm": 0.593463659286499, "learning_rate": 0.0002, "epoch": 3.957503320053121, "step": 4470}, {"loss": 1.5636, "grad_norm": 0.6261998414993286, "learning_rate": 0.0002, "epoch": 3.9663567950420537, "step": 4480}, {"loss": 1.4888, "grad_norm": 0.6153767704963684, "learning_rate": 0.0002, "epoch": 3.975210270030987, "step": 4490}, {"loss": 1.4986, "grad_norm": 0.6184002757072449, "learning_rate": 0.0002, "epoch": 3.9840637450199203, "step": 4500}, {"loss": 1.5134, "grad_norm": 0.5212734341621399, "learning_rate": 0.0002, "epoch": 3.9929172200088536, "step": 4510}, {"eval_loss": 1.8745536804199219, "eval_runtime": 83.0125, "eval_samples_per_second": 6.204, "eval_steps_per_second": 0.783, "epoch": 4.0, "step": 4518}, {"loss": 1.4708, "grad_norm": 0.5871603488922119, "learning_rate": 0.0002, "epoch": 4.001770694997787, "step": 4520}, {"loss": 1.4139, "grad_norm": 0.6746091842651367, "learning_rate": 0.0002, "epoch": 4.01062416998672, "step": 4530}, {"loss": 1.3625, "grad_norm": 0.6159639358520508, "learning_rate": 0.0002, "epoch": 4.019477644975653, "step": 4540}, {"loss": 1.3766, "grad_norm": 0.7529398202896118, "learning_rate": 0.0002, "epoch": 4.028331119964586, "step": 4550}, {"loss": 1.3202, "grad_norm": 0.788398027420044, "learning_rate": 0.0002, "epoch": 4.037184594953519, "step": 4560}, {"loss": 1.4254, "grad_norm": 0.9679850935935974, "learning_rate": 0.0002, "epoch": 4.046038069942452, "step": 4570}, {"loss": 1.2911, "grad_norm": 0.6305310130119324, "learning_rate": 0.0002, "epoch": 4.054891544931386, "step": 4580}, {"loss": 1.3525, "grad_norm": 0.8557451963424683, "learning_rate": 0.0002, "epoch": 4.063745019920319, "step": 4590}, {"loss": 1.3901, "grad_norm": 0.741518497467041, "learning_rate": 0.0002, "epoch": 4.0725984949092515, "step": 4600}, {"loss": 1.3374, "grad_norm": 0.6573862433433533, "learning_rate": 0.0002, "epoch": 4.081451969898185, "step": 4610}, {"loss": 1.3341, "grad_norm": 0.6926319599151611, "learning_rate": 0.0002, "epoch": 4.090305444887118, "step": 4620}, {"loss": 1.4176, "grad_norm": 0.9212626218795776, "learning_rate": 0.0002, "epoch": 4.099158919876051, "step": 4630}, {"loss": 1.3402, "grad_norm": 0.7167867422103882, "learning_rate": 0.0002, "epoch": 4.108012394864985, "step": 4640}, {"loss": 1.3333, "grad_norm": 0.6691595911979675, "learning_rate": 0.0002, "epoch": 4.116865869853918, "step": 4650}, {"loss": 1.247, "grad_norm": 0.8708247542381287, "learning_rate": 0.0002, "epoch": 4.12571934484285, "step": 4660}, {"loss": 1.3599, "grad_norm": 0.8612170219421387, "learning_rate": 0.0002, "epoch": 4.134572819831784, "step": 4670}, {"loss": 1.3418, "grad_norm": 0.7688325047492981, "learning_rate": 0.0002, "epoch": 4.143426294820717, "step": 4680}, {"loss": 1.4349, "grad_norm": 0.7606917023658752, "learning_rate": 0.0002, "epoch": 4.152279769809651, "step": 4690}, {"loss": 1.3521, "grad_norm": 0.8241282105445862, "learning_rate": 0.0002, "epoch": 4.161133244798584, "step": 4700}, {"loss": 1.3325, "grad_norm": 0.7480464577674866, "learning_rate": 0.0002, "epoch": 4.1699867197875164, "step": 4710}, {"loss": 1.4027, "grad_norm": 0.7092460989952087, "learning_rate": 0.0002, "epoch": 4.17884019477645, "step": 4720}, {"loss": 1.4005, "grad_norm": 0.8782108426094055, "learning_rate": 0.0002, "epoch": 4.187693669765383, "step": 4730}, {"loss": 1.3626, "grad_norm": 0.6875300407409668, "learning_rate": 0.0002, "epoch": 4.196547144754316, "step": 4740}, {"loss": 1.3798, "grad_norm": 0.7713887691497803, "learning_rate": 0.0002, "epoch": 4.20540061974325, "step": 4750}, {"loss": 1.3822, "grad_norm": 0.8270819783210754, "learning_rate": 0.0002, "epoch": 4.2142540947321825, "step": 4760}, {"loss": 1.3559, "grad_norm": 0.7109288573265076, "learning_rate": 0.0002, "epoch": 4.223107569721115, "step": 4770}, {"loss": 1.3948, "grad_norm": 0.7209359407424927, "learning_rate": 0.0002, "epoch": 4.231961044710049, "step": 4780}, {"loss": 1.3691, "grad_norm": 0.7142833471298218, "learning_rate": 0.0002, "epoch": 4.240814519698982, "step": 4790}, {"loss": 1.3654, "grad_norm": 0.8526809811592102, "learning_rate": 0.0002, "epoch": 4.249667994687915, "step": 4800}, {"loss": 1.3819, "grad_norm": 0.7064695954322815, "learning_rate": 0.0002, "epoch": 4.2585214696768485, "step": 4810}, {"loss": 1.3333, "grad_norm": 0.7646124362945557, "learning_rate": 0.0002, "epoch": 4.267374944665781, "step": 4820}, {"loss": 1.4247, "grad_norm": 0.7377115488052368, "learning_rate": 0.0002, "epoch": 4.276228419654714, "step": 4830}, {"loss": 1.3683, "grad_norm": 0.7308453321456909, "learning_rate": 0.0002, "epoch": 4.285081894643648, "step": 4840}, {"loss": 1.3653, "grad_norm": 0.6687684059143066, "learning_rate": 0.0002, "epoch": 4.293935369632581, "step": 4850}, {"loss": 1.3538, "grad_norm": 0.7447634339332581, "learning_rate": 0.0002, "epoch": 4.302788844621514, "step": 4860}, {"loss": 1.3842, "grad_norm": 0.7661601305007935, "learning_rate": 0.0002, "epoch": 4.311642319610447, "step": 4870}, {"loss": 1.3783, "grad_norm": 0.7492215037345886, "learning_rate": 0.0002, "epoch": 4.32049579459938, "step": 4880}, {"loss": 1.4089, "grad_norm": 0.9554458856582642, "learning_rate": 0.0002, "epoch": 4.329349269588313, "step": 4890}, {"loss": 1.3582, "grad_norm": 0.7409822940826416, "learning_rate": 0.0002, "epoch": 4.338202744577247, "step": 4900}, {"loss": 1.2581, "grad_norm": 0.9848645329475403, "learning_rate": 0.0002, "epoch": 4.34705621956618, "step": 4910}, {"loss": 1.3809, "grad_norm": 0.803995668888092, "learning_rate": 0.0002, "epoch": 4.355909694555113, "step": 4920}, {"loss": 1.3585, "grad_norm": 0.7480606436729431, "learning_rate": 0.0002, "epoch": 4.364763169544046, "step": 4930}, {"loss": 1.4092, "grad_norm": 0.7018141150474548, "learning_rate": 0.0002, "epoch": 4.373616644532979, "step": 4940}, {"loss": 1.4034, "grad_norm": 0.7684932351112366, "learning_rate": 0.0002, "epoch": 4.382470119521912, "step": 4950}, {"loss": 1.3937, "grad_norm": 0.7849185466766357, "learning_rate": 0.0002, "epoch": 4.391323594510846, "step": 4960}, {"loss": 1.3763, "grad_norm": 0.7858862280845642, "learning_rate": 0.0002, "epoch": 4.400177069499779, "step": 4970}, {"loss": 1.3901, "grad_norm": 0.8270778059959412, "learning_rate": 0.0002, "epoch": 4.4090305444887115, "step": 4980}, {"loss": 1.445, "grad_norm": 0.8464101552963257, "learning_rate": 0.0002, "epoch": 4.417884019477645, "step": 4990}, {"loss": 1.3586, "grad_norm": 0.85670405626297, "learning_rate": 0.0002, "epoch": 4.426737494466578, "step": 5000}, {"loss": 1.4203, "grad_norm": 0.8656655550003052, "learning_rate": 0.0002, "epoch": 4.435590969455511, "step": 5010}, {"loss": 1.3426, "grad_norm": 0.7605292201042175, "learning_rate": 0.0002, "epoch": 4.444444444444445, "step": 5020}, {"loss": 1.3803, "grad_norm": 0.7682471871376038, "learning_rate": 0.0002, "epoch": 4.4532979194333775, "step": 5030}, {"loss": 1.3432, "grad_norm": 0.7209102511405945, "learning_rate": 0.0002, "epoch": 4.46215139442231, "step": 5040}, {"loss": 1.5126, "grad_norm": 0.8259989023208618, "learning_rate": 0.0002, "epoch": 4.471004869411244, "step": 5050}, {"loss": 1.3709, "grad_norm": 0.7342197895050049, "learning_rate": 0.0002, "epoch": 4.479858344400177, "step": 5060}, {"loss": 1.4196, "grad_norm": 0.7869040369987488, "learning_rate": 0.0002, "epoch": 4.48871181938911, "step": 5070}, {"loss": 1.3734, "grad_norm": 0.7906143665313721, "learning_rate": 0.0002, "epoch": 4.4975652943780435, "step": 5080}, {"loss": 1.3555, "grad_norm": 0.7336861491203308, "learning_rate": 0.0002, "epoch": 4.506418769366976, "step": 5090}, {"loss": 1.3768, "grad_norm": 0.8264166712760925, "learning_rate": 0.0002, "epoch": 4.515272244355909, "step": 5100}, {"loss": 1.3822, "grad_norm": 0.8144693970680237, "learning_rate": 0.0002, "epoch": 4.524125719344843, "step": 5110}, {"loss": 1.3044, "grad_norm": 0.8257269263267517, "learning_rate": 0.0002, "epoch": 4.532979194333776, "step": 5120}, {"loss": 1.3501, "grad_norm": 0.8838174343109131, "learning_rate": 0.0002, "epoch": 4.541832669322709, "step": 5130}, {"loss": 1.3464, "grad_norm": 0.7081145644187927, "learning_rate": 0.0002, "epoch": 4.550686144311642, "step": 5140}, {"loss": 1.342, "grad_norm": 0.7137823700904846, "learning_rate": 0.0002, "epoch": 4.559539619300575, "step": 5150}, {"loss": 1.3788, "grad_norm": 0.7890386581420898, "learning_rate": 0.0002, "epoch": 4.568393094289509, "step": 5160}, {"loss": 1.3368, "grad_norm": 0.6418015360832214, "learning_rate": 0.0002, "epoch": 4.577246569278442, "step": 5170}, {"loss": 1.3892, "grad_norm": 0.768373966217041, "learning_rate": 0.0002, "epoch": 4.586100044267375, "step": 5180}, {"loss": 1.3953, "grad_norm": 0.6934067606925964, "learning_rate": 0.0002, "epoch": 4.5949535192563085, "step": 5190}, {"loss": 1.3782, "grad_norm": 0.9430719017982483, "learning_rate": 0.0002, "epoch": 4.603806994245241, "step": 5200}, {"loss": 1.3981, "grad_norm": 0.880264163017273, "learning_rate": 0.0002, "epoch": 4.612660469234174, "step": 5210}, {"loss": 1.3506, "grad_norm": 0.7584623098373413, "learning_rate": 0.0002, "epoch": 4.621513944223108, "step": 5220}, {"loss": 1.3973, "grad_norm": 0.7974506616592407, "learning_rate": 0.0002, "epoch": 4.630367419212041, "step": 5230}, {"loss": 1.3818, "grad_norm": 0.8812133073806763, "learning_rate": 0.0002, "epoch": 4.639220894200974, "step": 5240}, {"loss": 1.4002, "grad_norm": 0.8968724012374878, "learning_rate": 0.0002, "epoch": 4.648074369189907, "step": 5250}, {"loss": 1.3327, "grad_norm": 0.7317764759063721, "learning_rate": 0.0002, "epoch": 4.65692784417884, "step": 5260}, {"loss": 1.4363, "grad_norm": 0.7415484189987183, "learning_rate": 0.0002, "epoch": 4.665781319167773, "step": 5270}, {"loss": 1.3673, "grad_norm": 0.7867009043693542, "learning_rate": 0.0002, "epoch": 4.674634794156707, "step": 5280}, {"loss": 1.4246, "grad_norm": 0.6895416378974915, "learning_rate": 0.0002, "epoch": 4.68348826914564, "step": 5290}, {"loss": 1.3438, "grad_norm": 0.7324506640434265, "learning_rate": 0.0002, "epoch": 4.6923417441345725, "step": 5300}, {"loss": 1.4072, "grad_norm": 0.7383193969726562, "learning_rate": 0.0002, "epoch": 4.701195219123506, "step": 5310}, {"loss": 1.3269, "grad_norm": 0.8254916071891785, "learning_rate": 0.0002, "epoch": 4.710048694112439, "step": 5320}, {"loss": 1.4317, "grad_norm": 0.8161033987998962, "learning_rate": 0.0002, "epoch": 4.718902169101372, "step": 5330}, {"loss": 1.3623, "grad_norm": 0.7664386034011841, "learning_rate": 0.0002, "epoch": 4.727755644090306, "step": 5340}, {"loss": 1.4293, "grad_norm": 0.7465475797653198, "learning_rate": 0.0002, "epoch": 4.7366091190792385, "step": 5350}, {"loss": 1.3435, "grad_norm": 0.7810078263282776, "learning_rate": 0.0002, "epoch": 4.745462594068171, "step": 5360}, {"loss": 1.4489, "grad_norm": 0.7428439855575562, "learning_rate": 0.0002, "epoch": 4.754316069057105, "step": 5370}, {"loss": 1.3607, "grad_norm": 0.9548320174217224, "learning_rate": 0.0002, "epoch": 4.763169544046038, "step": 5380}, {"loss": 1.3398, "grad_norm": 0.7959533333778381, "learning_rate": 0.0002, "epoch": 4.772023019034972, "step": 5390}, {"loss": 1.3448, "grad_norm": 0.747473418712616, "learning_rate": 0.0002, "epoch": 4.780876494023905, "step": 5400}, {"loss": 1.3954, "grad_norm": 0.7863122820854187, "learning_rate": 0.0002, "epoch": 4.789729969012837, "step": 5410}, {"loss": 1.4166, "grad_norm": 0.7769626379013062, "learning_rate": 0.0002, "epoch": 4.798583444001771, "step": 5420}, {"loss": 1.4484, "grad_norm": 0.8551191091537476, "learning_rate": 0.0002, "epoch": 4.807436918990704, "step": 5430}, {"loss": 1.4314, "grad_norm": 0.8364850878715515, "learning_rate": 0.0002, "epoch": 4.816290393979637, "step": 5440}, {"loss": 1.4028, "grad_norm": 0.7458856701850891, "learning_rate": 0.0002, "epoch": 4.825143868968571, "step": 5450}, {"loss": 1.3923, "grad_norm": 0.7558291554450989, "learning_rate": 0.0002, "epoch": 4.8339973439575035, "step": 5460}, {"loss": 1.3343, "grad_norm": 0.8396534323692322, "learning_rate": 0.0002, "epoch": 4.842850818946436, "step": 5470}, {"loss": 1.3853, "grad_norm": 0.7790794968605042, "learning_rate": 0.0002, "epoch": 4.85170429393537, "step": 5480}, {"loss": 1.406, "grad_norm": 0.8607641458511353, "learning_rate": 0.0002, "epoch": 4.860557768924303, "step": 5490}, {"loss": 1.4011, "grad_norm": 0.828134298324585, "learning_rate": 0.0002, "epoch": 4.869411243913236, "step": 5500}, {"loss": 1.4089, "grad_norm": 0.8783106803894043, "learning_rate": 0.0002, "epoch": 4.8782647189021695, "step": 5510}, {"loss": 1.4565, "grad_norm": 0.7476183176040649, "learning_rate": 0.0002, "epoch": 4.887118193891102, "step": 5520}, {"loss": 1.3974, "grad_norm": 0.8023254871368408, "learning_rate": 0.0002, "epoch": 4.895971668880035, "step": 5530}, {"loss": 1.2979, "grad_norm": 0.8021706938743591, "learning_rate": 0.0002, "epoch": 4.904825143868969, "step": 5540}, {"loss": 1.4139, "grad_norm": 0.7873618602752686, "learning_rate": 0.0002, "epoch": 4.913678618857902, "step": 5550}, {"loss": 1.4393, "grad_norm": 0.7181428670883179, "learning_rate": 0.0002, "epoch": 4.922532093846835, "step": 5560}, {"loss": 1.3968, "grad_norm": 0.7464273571968079, "learning_rate": 0.0002, "epoch": 4.931385568835768, "step": 5570}, {"loss": 1.3184, "grad_norm": 0.7433671355247498, "learning_rate": 0.0002, "epoch": 4.940239043824701, "step": 5580}, {"loss": 1.4174, "grad_norm": 0.7571114301681519, "learning_rate": 0.0002, "epoch": 4.949092518813634, "step": 5590}, {"loss": 1.4418, "grad_norm": 0.7811630964279175, "learning_rate": 0.0002, "epoch": 4.957945993802568, "step": 5600}, {"loss": 1.4288, "grad_norm": 0.7609148621559143, "learning_rate": 0.0002, "epoch": 4.966799468791501, "step": 5610}, {"loss": 1.3786, "grad_norm": 0.7324382066726685, "learning_rate": 0.0002, "epoch": 4.9756529437804335, "step": 5620}, {"loss": 1.4557, "grad_norm": 0.9249559640884399, "learning_rate": 0.0002, "epoch": 4.984506418769367, "step": 5630}, {"loss": 1.4064, "grad_norm": 0.7852522134780884, "learning_rate": 0.0002, "epoch": 4.9933598937583, "step": 5640}, {"eval_loss": 1.9384633302688599, "eval_runtime": 82.6042, "eval_samples_per_second": 6.235, "eval_steps_per_second": 0.787, "epoch": 4.999557326250553, "step": 5647}, {"loss": 1.4261, "grad_norm": 0.8052749037742615, "learning_rate": 0.0002, "epoch": 5.002213368747233, "step": 5650}, {"loss": 1.1967, "grad_norm": 1.380603551864624, "learning_rate": 0.0002, "epoch": 5.011066843736167, "step": 5660}, {"loss": 1.1871, "grad_norm": 0.9197829365730286, "learning_rate": 0.0002, "epoch": 5.0199203187251, "step": 5670}, {"loss": 1.1966, "grad_norm": 0.9338570833206177, "learning_rate": 0.0002, "epoch": 5.028773793714032, "step": 5680}, {"loss": 1.1866, "grad_norm": 1.0464060306549072, "learning_rate": 0.0002, "epoch": 5.037627268702966, "step": 5690}, {"loss": 1.2211, "grad_norm": 0.9055638909339905, "learning_rate": 0.0002, "epoch": 5.046480743691899, "step": 5700}, {"loss": 1.1987, "grad_norm": 0.9494627714157104, "learning_rate": 0.0002, "epoch": 5.055334218680832, "step": 5710}, {"loss": 1.2647, "grad_norm": 0.9680962562561035, "learning_rate": 0.0002, "epoch": 5.064187693669766, "step": 5720}, {"loss": 1.2452, "grad_norm": 1.0254695415496826, "learning_rate": 0.0002, "epoch": 5.0730411686586985, "step": 5730}, {"loss": 1.2006, "grad_norm": 0.9306758642196655, "learning_rate": 0.0002, "epoch": 5.081894643647631, "step": 5740}, {"loss": 1.2254, "grad_norm": 1.0620356798171997, "learning_rate": 0.0002, "epoch": 5.090748118636565, "step": 5750}, {"loss": 1.2628, "grad_norm": 1.0401700735092163, "learning_rate": 0.0002, "epoch": 5.099601593625498, "step": 5760}, {"loss": 1.1976, "grad_norm": 0.9916906952857971, "learning_rate": 0.0002, "epoch": 5.108455068614431, "step": 5770}, {"loss": 1.2847, "grad_norm": 0.8387252688407898, "learning_rate": 0.0002, "epoch": 5.1173085436033645, "step": 5780}, {"loss": 1.2472, "grad_norm": 0.9870850443840027, "learning_rate": 0.0002, "epoch": 5.126162018592297, "step": 5790}, {"loss": 1.1902, "grad_norm": 0.9204064011573792, "learning_rate": 0.0002, "epoch": 5.13501549358123, "step": 5800}, {"loss": 1.2266, "grad_norm": 0.9951931834220886, "learning_rate": 0.0002, "epoch": 5.143868968570164, "step": 5810}, {"loss": 1.2113, "grad_norm": 0.9745809435844421, "learning_rate": 0.0002, "epoch": 5.152722443559097, "step": 5820}, {"loss": 1.2549, "grad_norm": 0.9467785954475403, "learning_rate": 0.0002, "epoch": 5.16157591854803, "step": 5830}, {"loss": 1.2309, "grad_norm": 1.0451668500900269, "learning_rate": 0.0002, "epoch": 5.170429393536963, "step": 5840}, {"loss": 1.2215, "grad_norm": 0.9740142822265625, "learning_rate": 0.0002, "epoch": 5.179282868525896, "step": 5850}, {"loss": 1.2137, "grad_norm": 1.2158266305923462, "learning_rate": 0.0002, "epoch": 5.18813634351483, "step": 5860}, {"loss": 1.1631, "grad_norm": 1.0795036554336548, "learning_rate": 0.0002, "epoch": 5.196989818503763, "step": 5870}, {"loss": 1.1448, "grad_norm": 0.9578470587730408, "learning_rate": 0.0002, "epoch": 5.205843293492696, "step": 5880}, {"loss": 1.2183, "grad_norm": 0.8887509703636169, "learning_rate": 0.0002, "epoch": 5.214696768481629, "step": 5890}, {"loss": 1.1991, "grad_norm": 1.171006441116333, "learning_rate": 0.0002, "epoch": 5.223550243470562, "step": 5900}, {"loss": 1.1781, "grad_norm": 0.9016029834747314, "learning_rate": 0.0002, "epoch": 5.232403718459495, "step": 5910}, {"loss": 1.2057, "grad_norm": 1.173136830329895, "learning_rate": 0.0002, "epoch": 5.241257193448429, "step": 5920}, {"loss": 1.2856, "grad_norm": 0.8760318160057068, "learning_rate": 0.0002, "epoch": 5.250110668437362, "step": 5930}, {"loss": 1.2301, "grad_norm": 0.8998854160308838, "learning_rate": 0.0002, "epoch": 5.258964143426295, "step": 5940}, {"loss": 1.3058, "grad_norm": 1.017175316810608, "learning_rate": 0.0002, "epoch": 5.267817618415228, "step": 5950}, {"loss": 1.2552, "grad_norm": 0.8646609783172607, "learning_rate": 0.0002, "epoch": 5.276671093404161, "step": 5960}, {"loss": 1.2044, "grad_norm": 1.0030627250671387, "learning_rate": 0.0002, "epoch": 5.285524568393094, "step": 5970}, {"loss": 1.2365, "grad_norm": 0.975911557674408, "learning_rate": 0.0002, "epoch": 5.294378043382028, "step": 5980}, {"loss": 1.2307, "grad_norm": 0.9576130509376526, "learning_rate": 0.0002, "epoch": 5.303231518370961, "step": 5990}, {"loss": 1.2681, "grad_norm": 0.9566167593002319, "learning_rate": 0.0002, "epoch": 5.3120849933598935, "step": 6000}, {"loss": 1.2029, "grad_norm": 0.9200350642204285, "learning_rate": 0.0002, "epoch": 5.320938468348827, "step": 6010}, {"loss": 1.1871, "grad_norm": 1.0491118431091309, "learning_rate": 0.0002, "epoch": 5.32979194333776, "step": 6020}, {"loss": 1.2531, "grad_norm": 1.1199153661727905, "learning_rate": 0.0002, "epoch": 5.338645418326693, "step": 6030}, {"loss": 1.265, "grad_norm": 1.015252947807312, "learning_rate": 0.0002, "epoch": 5.347498893315627, "step": 6040}, {"loss": 1.2208, "grad_norm": 1.1076666116714478, "learning_rate": 0.0002, "epoch": 5.3563523683045595, "step": 6050}, {"loss": 1.1953, "grad_norm": 0.9224653840065002, "learning_rate": 0.0002, "epoch": 5.365205843293492, "step": 6060}, {"loss": 1.2045, "grad_norm": 1.0079779624938965, "learning_rate": 0.0002, "epoch": 5.374059318282426, "step": 6070}, {"loss": 1.2612, "grad_norm": 0.9627894759178162, "learning_rate": 0.0002, "epoch": 5.382912793271359, "step": 6080}, {"loss": 1.3116, "grad_norm": 1.0503166913986206, "learning_rate": 0.0002, "epoch": 5.391766268260292, "step": 6090}, {"loss": 1.2565, "grad_norm": 0.912736713886261, "learning_rate": 0.0002, "epoch": 5.400619743249226, "step": 6100}, {"loss": 1.204, "grad_norm": 1.2552032470703125, "learning_rate": 0.0002, "epoch": 5.409473218238158, "step": 6110}, {"loss": 1.2738, "grad_norm": 0.986230731010437, "learning_rate": 0.0002, "epoch": 5.418326693227091, "step": 6120}, {"loss": 1.3301, "grad_norm": 0.9869757294654846, "learning_rate": 0.0002, "epoch": 5.427180168216025, "step": 6130}, {"loss": 1.241, "grad_norm": 1.012027621269226, "learning_rate": 0.0002, "epoch": 5.436033643204958, "step": 6140}, {"loss": 1.224, "grad_norm": 0.8855568170547485, "learning_rate": 0.0002, "epoch": 5.444887118193891, "step": 6150}, {"loss": 1.2539, "grad_norm": 1.1522414684295654, "learning_rate": 0.0002, "epoch": 5.4537405931828244, "step": 6160}, {"loss": 1.2402, "grad_norm": 1.2448474168777466, "learning_rate": 0.0002, "epoch": 5.462594068171757, "step": 6170}, {"loss": 1.179, "grad_norm": 1.0362223386764526, "learning_rate": 0.0002, "epoch": 5.471447543160691, "step": 6180}, {"loss": 1.2351, "grad_norm": 0.9363031983375549, "learning_rate": 0.0002, "epoch": 5.480301018149624, "step": 6190}, {"loss": 1.2394, "grad_norm": 0.8852020502090454, "learning_rate": 0.0002, "epoch": 5.489154493138557, "step": 6200}, {"loss": 1.311, "grad_norm": 0.8577062487602234, "learning_rate": 0.0002, "epoch": 5.4980079681274905, "step": 6210}, {"loss": 1.2547, "grad_norm": 0.9351891875267029, "learning_rate": 0.0002, "epoch": 5.506861443116423, "step": 6220}, {"loss": 1.2804, "grad_norm": 1.0031992197036743, "learning_rate": 0.0002, "epoch": 5.515714918105356, "step": 6230}, {"loss": 1.219, "grad_norm": 0.9935104250907898, "learning_rate": 0.0002, "epoch": 5.52456839309429, "step": 6240}, {"loss": 1.2756, "grad_norm": 1.1086243391036987, "learning_rate": 0.0002, "epoch": 5.533421868083223, "step": 6250}, {"loss": 1.2751, "grad_norm": 0.990772545337677, "learning_rate": 0.0002, "epoch": 5.542275343072156, "step": 6260}, {"loss": 1.2756, "grad_norm": 0.9317597150802612, "learning_rate": 0.0002, "epoch": 5.551128818061089, "step": 6270}, {"loss": 1.2095, "grad_norm": 0.9657552242279053, "learning_rate": 0.0002, "epoch": 5.559982293050022, "step": 6280}, {"loss": 1.2435, "grad_norm": 1.0271565914154053, "learning_rate": 0.0002, "epoch": 5.568835768038955, "step": 6290}, {"loss": 1.2283, "grad_norm": 0.916253924369812, "learning_rate": 0.0002, "epoch": 5.577689243027889, "step": 6300}, {"loss": 1.2648, "grad_norm": 1.0083940029144287, "learning_rate": 0.0002, "epoch": 5.586542718016822, "step": 6310}, {"loss": 1.2904, "grad_norm": 0.9740358591079712, "learning_rate": 0.0002, "epoch": 5.5953961930057545, "step": 6320}, {"loss": 1.2507, "grad_norm": 0.9645405411720276, "learning_rate": 0.0002, "epoch": 5.604249667994688, "step": 6330}, {"loss": 1.2845, "grad_norm": 0.9677100777626038, "learning_rate": 0.0002, "epoch": 5.613103142983621, "step": 6340}, {"loss": 1.2936, "grad_norm": 0.9706602692604065, "learning_rate": 0.0002, "epoch": 5.621956617972554, "step": 6350}, {"loss": 1.2541, "grad_norm": 1.1492316722869873, "learning_rate": 0.0002, "epoch": 5.630810092961488, "step": 6360}, {"loss": 1.2242, "grad_norm": 0.8857277035713196, "learning_rate": 0.0002, "epoch": 5.639663567950421, "step": 6370}, {"loss": 1.2178, "grad_norm": 1.0363037586212158, "learning_rate": 0.0002, "epoch": 5.648517042939353, "step": 6380}, {"loss": 1.1838, "grad_norm": 0.9621800780296326, "learning_rate": 0.0002, "epoch": 5.657370517928287, "step": 6390}, {"loss": 1.2472, "grad_norm": 0.9937820434570312, "learning_rate": 0.0002, "epoch": 5.66622399291722, "step": 6400}, {"loss": 1.2523, "grad_norm": 0.9491283297538757, "learning_rate": 0.0002, "epoch": 5.675077467906153, "step": 6410}, {"loss": 1.2539, "grad_norm": 0.9429448246955872, "learning_rate": 0.0002, "epoch": 5.683930942895087, "step": 6420}, {"loss": 1.1663, "grad_norm": 0.9808844327926636, "learning_rate": 0.0002, "epoch": 5.6927844178840195, "step": 6430}, {"loss": 1.2574, "grad_norm": 0.8191056847572327, "learning_rate": 0.0002, "epoch": 5.701637892872952, "step": 6440}, {"loss": 1.2659, "grad_norm": 1.1118974685668945, "learning_rate": 0.0002, "epoch": 5.710491367861886, "step": 6450}, {"loss": 1.2192, "grad_norm": 0.9030969142913818, "learning_rate": 0.0002, "epoch": 5.719344842850819, "step": 6460}, {"loss": 1.301, "grad_norm": 1.0509997606277466, "learning_rate": 0.0002, "epoch": 5.728198317839752, "step": 6470}, {"loss": 1.217, "grad_norm": 1.0369981527328491, "learning_rate": 0.0002, "epoch": 5.7370517928286855, "step": 6480}, {"loss": 1.2518, "grad_norm": 0.8626071214675903, "learning_rate": 0.0002, "epoch": 5.745905267817618, "step": 6490}, {"loss": 1.2446, "grad_norm": 1.0448849201202393, "learning_rate": 0.0002, "epoch": 5.754758742806551, "step": 6500}, {"loss": 1.2698, "grad_norm": 0.9333119988441467, "learning_rate": 0.0002, "epoch": 5.763612217795485, "step": 6510}, {"loss": 1.2655, "grad_norm": 0.8533532023429871, "learning_rate": 0.0002, "epoch": 5.772465692784418, "step": 6520}, {"loss": 1.3037, "grad_norm": 0.9774261713027954, "learning_rate": 0.0002, "epoch": 5.781319167773351, "step": 6530}, {"loss": 1.2031, "grad_norm": 0.9841071963310242, "learning_rate": 0.0002, "epoch": 5.790172642762284, "step": 6540}, {"loss": 1.2767, "grad_norm": 0.9891805052757263, "learning_rate": 0.0002, "epoch": 5.799026117751217, "step": 6550}, {"loss": 1.3373, "grad_norm": 0.9633952379226685, "learning_rate": 0.0002, "epoch": 5.80787959274015, "step": 6560}, {"loss": 1.1939, "grad_norm": 1.327634334564209, "learning_rate": 0.0002, "epoch": 5.816733067729084, "step": 6570}, {"loss": 1.2985, "grad_norm": 0.9805197715759277, "learning_rate": 0.0002, "epoch": 5.825586542718017, "step": 6580}, {"loss": 1.1933, "grad_norm": 1.020957589149475, "learning_rate": 0.0002, "epoch": 5.8344400177069495, "step": 6590}, {"loss": 1.2582, "grad_norm": 0.9694032669067383, "learning_rate": 0.0002, "epoch": 5.843293492695883, "step": 6600}, {"loss": 1.2671, "grad_norm": 0.8980914354324341, "learning_rate": 0.0002, "epoch": 5.852146967684816, "step": 6610}, {"loss": 1.3391, "grad_norm": 0.8312330842018127, "learning_rate": 0.0002, "epoch": 5.861000442673749, "step": 6620}, {"loss": 1.3301, "grad_norm": 0.9773725271224976, "learning_rate": 0.0002, "epoch": 5.869853917662683, "step": 6630}, {"loss": 1.2697, "grad_norm": 0.9684233665466309, "learning_rate": 0.0002, "epoch": 5.878707392651616, "step": 6640}, {"loss": 1.2866, "grad_norm": 0.8436519503593445, "learning_rate": 0.0002, "epoch": 5.887560867640548, "step": 6650}, {"loss": 1.2213, "grad_norm": 0.9129888415336609, "learning_rate": 0.0002, "epoch": 5.896414342629482, "step": 6660}, {"loss": 1.3272, "grad_norm": 0.8871369957923889, "learning_rate": 0.0002, "epoch": 5.905267817618415, "step": 6670}, {"loss": 1.3758, "grad_norm": 0.9544420838356018, "learning_rate": 0.0002, "epoch": 5.914121292607349, "step": 6680}, {"loss": 1.2954, "grad_norm": 0.9607448577880859, "learning_rate": 0.0002, "epoch": 5.922974767596282, "step": 6690}, {"loss": 1.2448, "grad_norm": 0.9675708413124084, "learning_rate": 0.0002, "epoch": 5.9318282425852145, "step": 6700}, {"loss": 1.3208, "grad_norm": 0.9373534321784973, "learning_rate": 0.0002, "epoch": 5.940681717574148, "step": 6710}, {"loss": 1.2982, "grad_norm": 0.9750351905822754, "learning_rate": 0.0002, "epoch": 5.949535192563081, "step": 6720}, {"loss": 1.2575, "grad_norm": 0.9122727513313293, "learning_rate": 0.0002, "epoch": 5.958388667552014, "step": 6730}, {"loss": 1.2259, "grad_norm": 0.9300726652145386, "learning_rate": 0.0002, "epoch": 5.967242142540948, "step": 6740}, {"loss": 1.2634, "grad_norm": 0.972944438457489, "learning_rate": 0.0002, "epoch": 5.9760956175298805, "step": 6750}, {"loss": 1.3252, "grad_norm": 1.2385832071304321, "learning_rate": 0.0002, "epoch": 5.984949092518813, "step": 6760}, {"loss": 1.2417, "grad_norm": 0.9080338478088379, "learning_rate": 0.0002, "epoch": 5.993802567507747, "step": 6770}, {"eval_loss": 2.062002658843994, "eval_runtime": 83.2814, "eval_samples_per_second": 6.184, "eval_steps_per_second": 0.78, "epoch": 6.0, "step": 6777}, {"loss": 1.2408, "grad_norm": 0.8741096258163452, "learning_rate": 0.0002, "epoch": 6.00265604249668, "step": 6780}, {"loss": 1.1242, "grad_norm": 1.2510347366333008, "learning_rate": 0.0002, "epoch": 6.011509517485613, "step": 6790}, {"loss": 1.0269, "grad_norm": 1.063910722732544, "learning_rate": 0.0002, "epoch": 6.0203629924745465, "step": 6800}, {"loss": 1.0468, "grad_norm": 1.169573187828064, "learning_rate": 0.0002, "epoch": 6.029216467463479, "step": 6810}, {"loss": 1.1221, "grad_norm": 1.0453242063522339, "learning_rate": 0.0002, "epoch": 6.038069942452412, "step": 6820}, {"loss": 1.0469, "grad_norm": 1.1960445642471313, "learning_rate": 0.0002, "epoch": 6.046923417441346, "step": 6830}, {"loss": 1.1233, "grad_norm": 0.9427650570869446, "learning_rate": 0.0002, "epoch": 6.055776892430279, "step": 6840}, {"loss": 1.0114, "grad_norm": 1.2107350826263428, "learning_rate": 0.0002, "epoch": 6.064630367419212, "step": 6850}, {"loss": 1.0751, "grad_norm": 1.262130856513977, "learning_rate": 0.0002, "epoch": 6.073483842408145, "step": 6860}, {"loss": 1.0787, "grad_norm": 1.1628082990646362, "learning_rate": 0.0002, "epoch": 6.082337317397078, "step": 6870}, {"loss": 1.0828, "grad_norm": 1.0090514421463013, "learning_rate": 0.0002, "epoch": 6.091190792386011, "step": 6880}, {"loss": 1.0718, "grad_norm": 1.5029802322387695, "learning_rate": 0.0002, "epoch": 6.100044267374945, "step": 6890}, {"loss": 1.0549, "grad_norm": 1.0522133111953735, "learning_rate": 0.0002, "epoch": 6.108897742363878, "step": 6900}, {"loss": 1.0502, "grad_norm": 1.225534439086914, "learning_rate": 0.0002, "epoch": 6.117751217352811, "step": 6910}, {"loss": 1.0808, "grad_norm": 1.2859058380126953, "learning_rate": 0.0002, "epoch": 6.126604692341744, "step": 6920}, {"loss": 1.1206, "grad_norm": 1.215205192565918, "learning_rate": 0.0002, "epoch": 6.135458167330677, "step": 6930}, {"loss": 1.1442, "grad_norm": 1.1799274682998657, "learning_rate": 0.0002, "epoch": 6.14431164231961, "step": 6940}, {"loss": 1.0749, "grad_norm": 1.2553550004959106, "learning_rate": 0.0002, "epoch": 6.153165117308544, "step": 6950}, {"loss": 1.1427, "grad_norm": 1.2171931266784668, "learning_rate": 0.0002, "epoch": 6.162018592297477, "step": 6960}, {"loss": 1.0579, "grad_norm": 1.1896923780441284, "learning_rate": 0.0002, "epoch": 6.17087206728641, "step": 6970}, {"loss": 1.1477, "grad_norm": 1.007250189781189, "learning_rate": 0.0002, "epoch": 6.179725542275343, "step": 6980}, {"loss": 1.1551, "grad_norm": 1.2109580039978027, "learning_rate": 0.0002, "epoch": 6.188579017264276, "step": 6990}, {"loss": 1.0809, "grad_norm": 1.2197009325027466, "learning_rate": 0.0002, "epoch": 6.19743249225321, "step": 7000}, {"loss": 1.1322, "grad_norm": 1.1417629718780518, "learning_rate": 0.0002, "epoch": 6.206285967242143, "step": 7010}, {"loss": 1.0541, "grad_norm": 1.2337356805801392, "learning_rate": 0.0002, "epoch": 6.2151394422310755, "step": 7020}, {"loss": 1.0195, "grad_norm": 1.1230454444885254, "learning_rate": 0.0002, "epoch": 6.223992917220009, "step": 7030}, {"loss": 1.1873, "grad_norm": 1.0634387731552124, "learning_rate": 0.0002, "epoch": 6.232846392208942, "step": 7040}, {"loss": 1.0892, "grad_norm": 1.1566855907440186, "learning_rate": 0.0002, "epoch": 6.241699867197875, "step": 7050}, {"loss": 1.063, "grad_norm": 1.2251075506210327, "learning_rate": 0.0002, "epoch": 6.250553342186809, "step": 7060}, {"loss": 1.1169, "grad_norm": 1.2232472896575928, "learning_rate": 0.0002, "epoch": 6.2594068171757415, "step": 7070}, {"loss": 1.0394, "grad_norm": 1.1014091968536377, "learning_rate": 0.0002, "epoch": 6.268260292164674, "step": 7080}, {"loss": 1.0627, "grad_norm": 1.322811245918274, "learning_rate": 0.0002, "epoch": 6.277113767153608, "step": 7090}, {"loss": 1.1108, "grad_norm": 0.9820072650909424, "learning_rate": 0.0002, "epoch": 6.285967242142541, "step": 7100}, {"loss": 1.0823, "grad_norm": 1.13047456741333, "learning_rate": 0.0002, "epoch": 6.294820717131474, "step": 7110}, {"loss": 1.1012, "grad_norm": 1.145127534866333, "learning_rate": 0.0002, "epoch": 6.303674192120408, "step": 7120}, {"loss": 1.089, "grad_norm": 1.101465106010437, "learning_rate": 0.0002, "epoch": 6.31252766710934, "step": 7130}, {"loss": 1.1122, "grad_norm": 1.131705641746521, "learning_rate": 0.0002, "epoch": 6.321381142098273, "step": 7140}, {"loss": 1.0173, "grad_norm": 0.9876824617385864, "learning_rate": 0.0002, "epoch": 6.330234617087207, "step": 7150}, {"loss": 1.0184, "grad_norm": 1.2950096130371094, "learning_rate": 0.0002, "epoch": 6.33908809207614, "step": 7160}, {"loss": 1.0559, "grad_norm": 1.0496132373809814, "learning_rate": 0.0002, "epoch": 6.347941567065073, "step": 7170}, {"loss": 1.1334, "grad_norm": 1.3835711479187012, "learning_rate": 0.0002, "epoch": 6.3567950420540065, "step": 7180}, {"loss": 0.9777, "grad_norm": 1.176424503326416, "learning_rate": 0.0002, "epoch": 6.365648517042939, "step": 7190}, {"loss": 1.1034, "grad_norm": 1.3502846956253052, "learning_rate": 0.0002, "epoch": 6.374501992031872, "step": 7200}, {"loss": 1.0614, "grad_norm": 1.2429769039154053, "learning_rate": 0.0002, "epoch": 6.383355467020806, "step": 7210}, {"loss": 1.1712, "grad_norm": 1.138015866279602, "learning_rate": 0.0002, "epoch": 6.392208942009739, "step": 7220}, {"loss": 1.1602, "grad_norm": 1.4407539367675781, "learning_rate": 0.0002, "epoch": 6.401062416998672, "step": 7230}, {"loss": 1.1595, "grad_norm": 1.1464104652404785, "learning_rate": 0.0002, "epoch": 6.409915891987605, "step": 7240}, {"loss": 1.1381, "grad_norm": 1.2028888463974, "learning_rate": 0.0002, "epoch": 6.418769366976538, "step": 7250}, {"loss": 1.1129, "grad_norm": 1.132938027381897, "learning_rate": 0.0002, "epoch": 6.427622841965471, "step": 7260}, {"loss": 1.0662, "grad_norm": 1.2005301713943481, "learning_rate": 0.0002, "epoch": 6.436476316954405, "step": 7270}, {"loss": 1.0538, "grad_norm": 1.0460501909255981, "learning_rate": 0.0002, "epoch": 6.445329791943338, "step": 7280}, {"loss": 1.0958, "grad_norm": 1.1363240480422974, "learning_rate": 0.0002, "epoch": 6.4541832669322705, "step": 7290}, {"loss": 1.1042, "grad_norm": 1.0439460277557373, "learning_rate": 0.0002, "epoch": 6.463036741921204, "step": 7300}, {"loss": 1.0896, "grad_norm": 1.1968905925750732, "learning_rate": 0.0002, "epoch": 6.471890216910137, "step": 7310}, {"loss": 1.0891, "grad_norm": 1.0443525314331055, "learning_rate": 0.0002, "epoch": 6.48074369189907, "step": 7320}, {"loss": 1.1384, "grad_norm": 1.2550246715545654, "learning_rate": 0.0002, "epoch": 6.489597166888004, "step": 7330}, {"loss": 1.2028, "grad_norm": 1.2880409955978394, "learning_rate": 0.0002, "epoch": 6.4984506418769366, "step": 7340}, {"loss": 1.1173, "grad_norm": 1.2390265464782715, "learning_rate": 0.0002, "epoch": 6.507304116865869, "step": 7350}, {"loss": 1.065, "grad_norm": 1.0650159120559692, "learning_rate": 0.0002, "epoch": 6.516157591854803, "step": 7360}, {"loss": 1.1072, "grad_norm": 1.4934154748916626, "learning_rate": 0.0002, "epoch": 6.525011066843736, "step": 7370}, {"loss": 1.0436, "grad_norm": 1.0902682542800903, "learning_rate": 0.0002, "epoch": 6.533864541832669, "step": 7380}, {"loss": 1.145, "grad_norm": 1.1561789512634277, "learning_rate": 0.0002, "epoch": 6.542718016821603, "step": 7390}, {"loss": 1.1633, "grad_norm": 1.1010485887527466, "learning_rate": 0.0002, "epoch": 6.551571491810535, "step": 7400}, {"loss": 1.1063, "grad_norm": 1.1616493463516235, "learning_rate": 0.0002, "epoch": 6.560424966799468, "step": 7410}, {"loss": 1.1217, "grad_norm": 1.2321627140045166, "learning_rate": 0.0002, "epoch": 6.569278441788402, "step": 7420}, {"loss": 1.135, "grad_norm": 1.162299394607544, "learning_rate": 0.0002, "epoch": 6.578131916777335, "step": 7430}, {"loss": 1.1785, "grad_norm": 0.9935213923454285, "learning_rate": 0.0002, "epoch": 6.586985391766269, "step": 7440}, {"loss": 1.078, "grad_norm": 1.3035451173782349, "learning_rate": 0.0002, "epoch": 6.5958388667552015, "step": 7450}, {"loss": 1.1377, "grad_norm": 1.0957173109054565, "learning_rate": 0.0002, "epoch": 6.604692341744134, "step": 7460}, {"loss": 1.1882, "grad_norm": 1.166472315788269, "learning_rate": 0.0002, "epoch": 6.613545816733068, "step": 7470}, {"loss": 1.1379, "grad_norm": 1.332716464996338, "learning_rate": 0.0002, "epoch": 6.622399291722001, "step": 7480}, {"loss": 1.1686, "grad_norm": 1.1008102893829346, "learning_rate": 0.0002, "epoch": 6.631252766710934, "step": 7490}, {"loss": 1.1644, "grad_norm": 1.4472310543060303, "learning_rate": 0.0002, "epoch": 6.6401062416998675, "step": 7500}, {"loss": 1.1729, "grad_norm": 1.1247508525848389, "learning_rate": 0.0002, "epoch": 6.6489597166888, "step": 7510}, {"loss": 1.1649, "grad_norm": 1.297936201095581, "learning_rate": 0.0002, "epoch": 6.657813191677733, "step": 7520}, {"loss": 1.1178, "grad_norm": 1.0784718990325928, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 7530}, {"loss": 1.0852, "grad_norm": 1.1518864631652832, "learning_rate": 0.0002, "epoch": 6.6755201416556, "step": 7540}, {"loss": 1.1611, "grad_norm": 1.1135684251785278, "learning_rate": 0.0002, "epoch": 6.684373616644533, "step": 7550}, {"loss": 1.1257, "grad_norm": 1.0792579650878906, "learning_rate": 0.0002, "epoch": 6.693227091633466, "step": 7560}, {"loss": 1.1466, "grad_norm": 1.1826539039611816, "learning_rate": 0.0002, "epoch": 6.702080566622399, "step": 7570}, {"loss": 1.0874, "grad_norm": 1.1485552787780762, "learning_rate": 0.0002, "epoch": 6.710934041611332, "step": 7580}, {"loss": 1.0502, "grad_norm": 1.090723991394043, "learning_rate": 0.0002, "epoch": 6.719787516600266, "step": 7590}, {"loss": 1.0627, "grad_norm": 1.105883002281189, "learning_rate": 0.0002, "epoch": 6.728640991589199, "step": 7600}, {"loss": 1.1101, "grad_norm": 1.3093862533569336, "learning_rate": 0.0002, "epoch": 6.737494466578132, "step": 7610}, {"loss": 1.1202, "grad_norm": 1.0273808240890503, "learning_rate": 0.0002, "epoch": 6.746347941567065, "step": 7620}, {"loss": 1.2071, "grad_norm": 1.3253363370895386, "learning_rate": 0.0002, "epoch": 6.755201416555998, "step": 7630}, {"loss": 1.0833, "grad_norm": 1.1979365348815918, "learning_rate": 0.0002, "epoch": 6.764054891544931, "step": 7640}, {"loss": 1.1208, "grad_norm": 1.123506784439087, "learning_rate": 0.0002, "epoch": 6.772908366533865, "step": 7650}, {"loss": 1.2111, "grad_norm": 1.3928422927856445, "learning_rate": 0.0002, "epoch": 6.781761841522798, "step": 7660}, {"loss": 1.1535, "grad_norm": 1.1540825366973877, "learning_rate": 0.0002, "epoch": 6.790615316511731, "step": 7670}, {"loss": 1.1053, "grad_norm": 1.0836732387542725, "learning_rate": 0.0002, "epoch": 6.799468791500664, "step": 7680}, {"loss": 1.1049, "grad_norm": 1.0360240936279297, "learning_rate": 0.0002, "epoch": 6.808322266489597, "step": 7690}, {"loss": 1.1819, "grad_norm": 1.2440129518508911, "learning_rate": 0.0002, "epoch": 6.817175741478531, "step": 7700}, {"loss": 1.1245, "grad_norm": 1.1702594757080078, "learning_rate": 0.0002, "epoch": 6.826029216467464, "step": 7710}, {"loss": 1.1021, "grad_norm": 1.0726280212402344, "learning_rate": 0.0002, "epoch": 6.8348826914563965, "step": 7720}, {"loss": 1.1471, "grad_norm": 0.9410907030105591, "learning_rate": 0.0002, "epoch": 6.84373616644533, "step": 7730}, {"loss": 1.1616, "grad_norm": 1.042914867401123, "learning_rate": 0.0002, "epoch": 6.852589641434263, "step": 7740}, {"loss": 1.215, "grad_norm": 1.1028170585632324, "learning_rate": 0.0002, "epoch": 6.861443116423196, "step": 7750}, {"loss": 1.0759, "grad_norm": 1.0990355014801025, "learning_rate": 0.0002, "epoch": 6.87029659141213, "step": 7760}, {"loss": 1.1508, "grad_norm": 1.2572479248046875, "learning_rate": 0.0002, "epoch": 6.8791500664010625, "step": 7770}, {"loss": 1.1749, "grad_norm": 1.250198483467102, "learning_rate": 0.0002, "epoch": 6.888003541389995, "step": 7780}, {"loss": 1.1299, "grad_norm": 1.1872532367706299, "learning_rate": 0.0002, "epoch": 6.896857016378929, "step": 7790}, {"loss": 1.129, "grad_norm": 1.5275602340698242, "learning_rate": 0.0002, "epoch": 6.905710491367862, "step": 7800}, {"loss": 1.0712, "grad_norm": 1.015166163444519, "learning_rate": 0.0002, "epoch": 6.914563966356795, "step": 7810}, {"loss": 1.1931, "grad_norm": 1.3205344676971436, "learning_rate": 0.0002, "epoch": 6.923417441345729, "step": 7820}, {"loss": 1.222, "grad_norm": 1.1329596042633057, "learning_rate": 0.0002, "epoch": 6.932270916334661, "step": 7830}, {"loss": 1.1207, "grad_norm": 1.1614333391189575, "learning_rate": 0.0002, "epoch": 6.941124391323594, "step": 7840}, {"loss": 1.2127, "grad_norm": 1.3472208976745605, "learning_rate": 0.0002, "epoch": 6.949977866312528, "step": 7850}, {"loss": 1.1002, "grad_norm": 1.1490193605422974, "learning_rate": 0.0002, "epoch": 6.958831341301461, "step": 7860}, {"loss": 1.1362, "grad_norm": 1.1343097686767578, "learning_rate": 0.0002, "epoch": 6.967684816290394, "step": 7870}, {"loss": 1.1622, "grad_norm": 1.2555341720581055, "learning_rate": 0.0002, "epoch": 6.9765382912793275, "step": 7880}, {"loss": 1.0955, "grad_norm": 1.2695735692977905, "learning_rate": 0.0002, "epoch": 6.98539176626826, "step": 7890}, {"loss": 1.1718, "grad_norm": 1.1662464141845703, "learning_rate": 0.0002, "epoch": 6.994245241257193, "step": 7900}, {"eval_loss": 2.148611068725586, "eval_runtime": 82.53, "eval_samples_per_second": 6.24, "eval_steps_per_second": 0.788, "epoch": 6.999557326250553, "step": 7906}, {"loss": 1.1038, "grad_norm": 1.0013059377670288, "learning_rate": 0.0002, "epoch": 7.003098716246127, "step": 7910}, {"loss": 0.962, "grad_norm": 1.317168951034546, "learning_rate": 0.0002, "epoch": 7.01195219123506, "step": 7920}, {"loss": 0.9373, "grad_norm": 1.2173038721084595, "learning_rate": 0.0002, "epoch": 7.020805666223993, "step": 7930}, {"loss": 0.9371, "grad_norm": 1.5555535554885864, "learning_rate": 0.0002, "epoch": 7.029659141212926, "step": 7940}, {"loss": 0.8791, "grad_norm": 1.1929986476898193, "learning_rate": 0.0002, "epoch": 7.038512616201859, "step": 7950}, {"loss": 0.9134, "grad_norm": 1.3552240133285522, "learning_rate": 0.0002, "epoch": 7.047366091190792, "step": 7960}, {"loss": 0.9813, "grad_norm": 1.3692620992660522, "learning_rate": 0.0002, "epoch": 7.056219566179726, "step": 7970}, {"loss": 0.9342, "grad_norm": 1.4173164367675781, "learning_rate": 0.0002, "epoch": 7.065073041168659, "step": 7980}, {"loss": 0.8799, "grad_norm": 1.2271063327789307, "learning_rate": 0.0002, "epoch": 7.0739265161575915, "step": 7990}, {"loss": 0.9586, "grad_norm": 1.4002584218978882, "learning_rate": 0.0002, "epoch": 7.082779991146525, "step": 8000}, {"loss": 0.9682, "grad_norm": 1.345386028289795, "learning_rate": 0.0002, "epoch": 7.091633466135458, "step": 8010}, {"loss": 0.9581, "grad_norm": 1.3328183889389038, "learning_rate": 0.0002, "epoch": 7.100486941124391, "step": 8020}, {"loss": 0.9408, "grad_norm": 1.1148749589920044, "learning_rate": 0.0002, "epoch": 7.109340416113325, "step": 8030}, {"loss": 0.8894, "grad_norm": 1.316633939743042, "learning_rate": 0.0002, "epoch": 7.1181938911022575, "step": 8040}, {"loss": 0.9547, "grad_norm": 1.2247374057769775, "learning_rate": 0.0002, "epoch": 7.12704736609119, "step": 8050}, {"loss": 0.9495, "grad_norm": 1.3124101161956787, "learning_rate": 0.0002, "epoch": 7.135900841080124, "step": 8060}, {"loss": 0.9922, "grad_norm": 1.3420861959457397, "learning_rate": 0.0002, "epoch": 7.144754316069057, "step": 8070}, {"loss": 0.9626, "grad_norm": 1.2799710035324097, "learning_rate": 0.0002, "epoch": 7.15360779105799, "step": 8080}, {"loss": 0.9021, "grad_norm": 1.3490463495254517, "learning_rate": 0.0002, "epoch": 7.162461266046924, "step": 8090}, {"loss": 1.0247, "grad_norm": 1.444670557975769, "learning_rate": 0.0002, "epoch": 7.171314741035856, "step": 8100}, {"loss": 0.8982, "grad_norm": 1.2264536619186401, "learning_rate": 0.0002, "epoch": 7.180168216024789, "step": 8110}, {"loss": 1.0122, "grad_norm": 1.2793710231781006, "learning_rate": 0.0002, "epoch": 7.189021691013723, "step": 8120}, {"loss": 0.9325, "grad_norm": 1.3160685300827026, "learning_rate": 0.0002, "epoch": 7.197875166002656, "step": 8130}, {"loss": 1.0383, "grad_norm": 1.289884090423584, "learning_rate": 0.0002, "epoch": 7.20672864099159, "step": 8140}, {"loss": 0.9422, "grad_norm": 1.6820887327194214, "learning_rate": 0.0002, "epoch": 7.2155821159805225, "step": 8150}, {"loss": 0.9301, "grad_norm": 1.403016209602356, "learning_rate": 0.0002, "epoch": 7.224435590969455, "step": 8160}, {"loss": 0.9361, "grad_norm": 1.3833755254745483, "learning_rate": 0.0002, "epoch": 7.233289065958389, "step": 8170}, {"loss": 0.9408, "grad_norm": 1.547101616859436, "learning_rate": 0.0002, "epoch": 7.242142540947322, "step": 8180}, {"loss": 0.9192, "grad_norm": 1.3376225233078003, "learning_rate": 0.0002, "epoch": 7.250996015936255, "step": 8190}, {"loss": 0.9351, "grad_norm": 1.3008460998535156, "learning_rate": 0.0002, "epoch": 7.2598494909251885, "step": 8200}, {"loss": 0.98, "grad_norm": 1.3364465236663818, "learning_rate": 0.0002, "epoch": 7.268702965914121, "step": 8210}, {"loss": 0.934, "grad_norm": 1.3967384099960327, "learning_rate": 0.0002, "epoch": 7.277556440903054, "step": 8220}, {"loss": 0.9587, "grad_norm": 1.538851022720337, "learning_rate": 0.0002, "epoch": 7.286409915891988, "step": 8230}, {"loss": 0.9856, "grad_norm": 1.6243304014205933, "learning_rate": 0.0002, "epoch": 7.295263390880921, "step": 8240}, {"loss": 0.9748, "grad_norm": 1.6250357627868652, "learning_rate": 0.0002, "epoch": 7.304116865869854, "step": 8250}, {"loss": 1.005, "grad_norm": 1.361752986907959, "learning_rate": 0.0002, "epoch": 7.312970340858787, "step": 8260}, {"loss": 0.883, "grad_norm": 1.4158686399459839, "learning_rate": 0.0002, "epoch": 7.32182381584772, "step": 8270}, {"loss": 0.9915, "grad_norm": 1.4000667333602905, "learning_rate": 0.0002, "epoch": 7.330677290836653, "step": 8280}, {"loss": 0.9323, "grad_norm": 1.293979287147522, "learning_rate": 0.0002, "epoch": 7.339530765825587, "step": 8290}, {"loss": 0.9544, "grad_norm": 1.3639771938323975, "learning_rate": 0.0002, "epoch": 7.34838424081452, "step": 8300}, {"loss": 0.9925, "grad_norm": 1.426788091659546, "learning_rate": 0.0002, "epoch": 7.3572377158034525, "step": 8310}, {"loss": 0.9162, "grad_norm": 1.3375388383865356, "learning_rate": 0.0002, "epoch": 7.366091190792386, "step": 8320}, {"loss": 1.032, "grad_norm": 1.2612264156341553, "learning_rate": 0.0002, "epoch": 7.374944665781319, "step": 8330}, {"loss": 0.9185, "grad_norm": 1.431223750114441, "learning_rate": 0.0002, "epoch": 7.383798140770252, "step": 8340}, {"loss": 1.0072, "grad_norm": 1.4454351663589478, "learning_rate": 0.0002, "epoch": 7.392651615759186, "step": 8350}, {"loss": 1.0231, "grad_norm": 1.3863321542739868, "learning_rate": 0.0002, "epoch": 7.401505090748119, "step": 8360}, {"loss": 1.0194, "grad_norm": 1.2186199426651, "learning_rate": 0.0002, "epoch": 7.410358565737051, "step": 8370}, {"loss": 1.0192, "grad_norm": 1.338301181793213, "learning_rate": 0.0002, "epoch": 7.419212040725985, "step": 8380}, {"loss": 0.999, "grad_norm": 1.4814497232437134, "learning_rate": 0.0002, "epoch": 7.428065515714918, "step": 8390}, {"loss": 0.9766, "grad_norm": 1.430943489074707, "learning_rate": 0.0002, "epoch": 7.436918990703851, "step": 8400}, {"loss": 1.0268, "grad_norm": 1.215942621231079, "learning_rate": 0.0002, "epoch": 7.445772465692785, "step": 8410}, {"loss": 0.9708, "grad_norm": 1.381890892982483, "learning_rate": 0.0002, "epoch": 7.4546259406817175, "step": 8420}, {"loss": 0.9153, "grad_norm": 1.390587568283081, "learning_rate": 0.0002, "epoch": 7.46347941567065, "step": 8430}, {"loss": 0.9696, "grad_norm": 1.6421098709106445, "learning_rate": 0.0002, "epoch": 7.472332890659584, "step": 8440}, {"loss": 1.032, "grad_norm": 1.43213951587677, "learning_rate": 0.0002, "epoch": 7.481186365648517, "step": 8450}, {"loss": 0.9545, "grad_norm": 1.3095251321792603, "learning_rate": 0.0002, "epoch": 7.490039840637451, "step": 8460}, {"loss": 1.0101, "grad_norm": 1.4996658563613892, "learning_rate": 0.0002, "epoch": 7.4988933156263835, "step": 8470}, {"loss": 0.9638, "grad_norm": 1.2955113649368286, "learning_rate": 0.0002, "epoch": 7.507746790615316, "step": 8480}, {"loss": 1.0388, "grad_norm": 1.3235514163970947, "learning_rate": 0.0002, "epoch": 7.51660026560425, "step": 8490}, {"loss": 1.014, "grad_norm": 1.408852219581604, "learning_rate": 0.0002, "epoch": 7.525453740593183, "step": 8500}, {"loss": 0.9258, "grad_norm": 1.4187248945236206, "learning_rate": 0.0002, "epoch": 7.534307215582116, "step": 8510}, {"loss": 0.9565, "grad_norm": 1.2473978996276855, "learning_rate": 0.0002, "epoch": 7.5431606905710495, "step": 8520}, {"loss": 1.001, "grad_norm": 1.2394654750823975, "learning_rate": 0.0002, "epoch": 7.552014165559982, "step": 8530}, {"loss": 0.9701, "grad_norm": 1.383175253868103, "learning_rate": 0.0002, "epoch": 7.560867640548915, "step": 8540}, {"loss": 0.9309, "grad_norm": 1.4113128185272217, "learning_rate": 0.0002, "epoch": 7.569721115537849, "step": 8550}, {"loss": 0.9908, "grad_norm": 1.4652873277664185, "learning_rate": 0.0002, "epoch": 7.578574590526782, "step": 8560}, {"loss": 1.0479, "grad_norm": 1.3373491764068604, "learning_rate": 0.0002, "epoch": 7.587428065515715, "step": 8570}, {"loss": 0.9994, "grad_norm": 1.2278908491134644, "learning_rate": 0.0002, "epoch": 7.596281540504648, "step": 8580}, {"loss": 0.9934, "grad_norm": 1.3615998029708862, "learning_rate": 0.0002, "epoch": 7.605135015493581, "step": 8590}, {"loss": 1.0324, "grad_norm": 1.5927653312683105, "learning_rate": 0.0002, "epoch": 7.613988490482514, "step": 8600}, {"loss": 1.0271, "grad_norm": 1.4127552509307861, "learning_rate": 0.0002, "epoch": 7.622841965471448, "step": 8610}, {"loss": 0.9713, "grad_norm": 1.276419997215271, "learning_rate": 0.0002, "epoch": 7.631695440460381, "step": 8620}, {"loss": 1.0321, "grad_norm": 1.3077269792556763, "learning_rate": 0.0002, "epoch": 7.640548915449314, "step": 8630}, {"loss": 1.0571, "grad_norm": 1.449960470199585, "learning_rate": 0.0002, "epoch": 7.649402390438247, "step": 8640}, {"loss": 1.0317, "grad_norm": 1.4388965368270874, "learning_rate": 0.0002, "epoch": 7.65825586542718, "step": 8650}, {"loss": 1.0161, "grad_norm": 1.4241976737976074, "learning_rate": 0.0002, "epoch": 7.667109340416113, "step": 8660}, {"loss": 1.0082, "grad_norm": 1.2062371969223022, "learning_rate": 0.0002, "epoch": 7.675962815405047, "step": 8670}, {"loss": 1.0034, "grad_norm": 1.288986325263977, "learning_rate": 0.0002, "epoch": 7.68481629039398, "step": 8680}, {"loss": 1.0254, "grad_norm": 1.3382292985916138, "learning_rate": 0.0002, "epoch": 7.6936697653829125, "step": 8690}, {"loss": 0.9996, "grad_norm": 1.2282090187072754, "learning_rate": 0.0002, "epoch": 7.702523240371846, "step": 8700}, {"loss": 1.0211, "grad_norm": 1.4728269577026367, "learning_rate": 0.0002, "epoch": 7.711376715360779, "step": 8710}, {"loss": 0.9809, "grad_norm": 1.0538904666900635, "learning_rate": 0.0002, "epoch": 7.720230190349712, "step": 8720}, {"loss": 0.947, "grad_norm": 1.3364583253860474, "learning_rate": 0.0002, "epoch": 7.729083665338646, "step": 8730}, {"loss": 0.9769, "grad_norm": 1.4484362602233887, "learning_rate": 0.0002, "epoch": 7.7379371403275785, "step": 8740}, {"loss": 0.9577, "grad_norm": 1.3406230211257935, "learning_rate": 0.0002, "epoch": 7.746790615316511, "step": 8750}, {"loss": 1.1238, "grad_norm": 1.3675546646118164, "learning_rate": 0.0002, "epoch": 7.755644090305445, "step": 8760}, {"loss": 0.9733, "grad_norm": 1.490721344947815, "learning_rate": 0.0002, "epoch": 7.764497565294378, "step": 8770}, {"loss": 0.9654, "grad_norm": 1.267425775527954, "learning_rate": 0.0002, "epoch": 7.773351040283311, "step": 8780}, {"loss": 0.9661, "grad_norm": 1.3113083839416504, "learning_rate": 0.0002, "epoch": 7.7822045152722445, "step": 8790}, {"loss": 1.0064, "grad_norm": 1.2262369394302368, "learning_rate": 0.0002, "epoch": 7.791057990261177, "step": 8800}, {"loss": 1.0915, "grad_norm": 1.2927134037017822, "learning_rate": 0.0002, "epoch": 7.79991146525011, "step": 8810}, {"loss": 1.0308, "grad_norm": 1.2576160430908203, "learning_rate": 0.0002, "epoch": 7.808764940239044, "step": 8820}, {"loss": 1.0077, "grad_norm": 1.3690781593322754, "learning_rate": 0.0002, "epoch": 7.817618415227977, "step": 8830}, {"loss": 1.0364, "grad_norm": 1.3828307390213013, "learning_rate": 0.0002, "epoch": 7.82647189021691, "step": 8840}, {"loss": 1.0492, "grad_norm": 1.4861878156661987, "learning_rate": 0.0002, "epoch": 7.835325365205843, "step": 8850}, {"loss": 1.062, "grad_norm": 1.403618335723877, "learning_rate": 0.0002, "epoch": 7.844178840194776, "step": 8860}, {"loss": 1.0392, "grad_norm": 1.4410181045532227, "learning_rate": 0.0002, "epoch": 7.853032315183709, "step": 8870}, {"loss": 0.9652, "grad_norm": 1.4488197565078735, "learning_rate": 0.0002, "epoch": 7.861885790172643, "step": 8880}, {"loss": 1.0167, "grad_norm": 1.6135752201080322, "learning_rate": 0.0002, "epoch": 7.870739265161576, "step": 8890}, {"loss": 1.0166, "grad_norm": 1.264705777168274, "learning_rate": 0.0002, "epoch": 7.879592740150509, "step": 8900}, {"loss": 1.0288, "grad_norm": 1.308629035949707, "learning_rate": 0.0002, "epoch": 7.888446215139442, "step": 8910}, {"loss": 1.0195, "grad_norm": 1.3849096298217773, "learning_rate": 0.0002, "epoch": 7.897299690128375, "step": 8920}, {"loss": 1.0059, "grad_norm": 1.4319216012954712, "learning_rate": 0.0002, "epoch": 7.906153165117309, "step": 8930}, {"loss": 0.9961, "grad_norm": 1.2494885921478271, "learning_rate": 0.0002, "epoch": 7.915006640106242, "step": 8940}, {"loss": 0.9895, "grad_norm": 1.4066457748413086, "learning_rate": 0.0002, "epoch": 7.923860115095175, "step": 8950}, {"loss": 1.0867, "grad_norm": 1.285872459411621, "learning_rate": 0.0002, "epoch": 7.932713590084108, "step": 8960}, {"loss": 1.0228, "grad_norm": 1.2378270626068115, "learning_rate": 0.0002, "epoch": 7.941567065073041, "step": 8970}, {"loss": 1.0107, "grad_norm": 1.547827124595642, "learning_rate": 0.0002, "epoch": 7.950420540061974, "step": 8980}, {"loss": 1.0742, "grad_norm": 1.539252519607544, "learning_rate": 0.0002, "epoch": 7.959274015050908, "step": 8990}, {"loss": 1.0258, "grad_norm": 1.230036973953247, "learning_rate": 0.0002, "epoch": 7.968127490039841, "step": 9000}, {"loss": 1.0198, "grad_norm": 1.4130570888519287, "learning_rate": 0.0002, "epoch": 7.9769809650287735, "step": 9010}, {"loss": 1.0025, "grad_norm": 1.4037895202636719, "learning_rate": 0.0002, "epoch": 7.985834440017707, "step": 9020}, {"loss": 0.9551, "grad_norm": 1.4847569465637207, "learning_rate": 0.0002, "epoch": 7.99468791500664, "step": 9030}]}